From 8fb7ed1facddf4f9b5c3ffe4f88a8befcc95111d Mon Sep 17 00:00:00 2001 From: Fahim Dalvi Date: Sun, 17 Sep 2023 13:54:30 +0300 Subject: [PATCH] Add 'splits' API for data and extend metadata for Datasets (#219) This commit introduces a "splits" API, which allows for each dataset to encode default splits like "test" or "train", or even sub-splits like "ar", "en" for various languages. All datasets have been updated to encode this information, along with a lot more metadata like licensing, default urls, task type etc. All assets have been updated to use the new API, along with lots of new tests that automatically check for various behaviors. Finally, the commit also performs a bunch of maintenance fixes like bad filenames/data paths. * Preliminary dataset metadata finder * Fix miscellaneous issues in assets like casing and old data paths * Fix missing/spurious imports * Fix more assets, introduce ArSarcasm2, etc. * Fix more assets, add missing ArSarcasm, improve metadata finder * More minor fixes * Minor fixes * Fix ArabGend citation * Also aggregate class labels * Rename Khouja20 datasets/assets to ANS * Rename News categorization assets * Fix some citations * Extend required metadata and add tasktypes to package * Add extended metadata to datasets * Add class labels to POS datasets * Add documentation for new metadata options * Make general_args optional in tests * Add dry run option * Convert all metadata() and get_data_sample() methods to static * Fix download tests * Fix AraBench splits * Implement 'splits' concept across benchmark and datasets * Add 'splits' support for fewshot assets and tests * Update BLOOMZ assets to new 'splits' API * Fix splits in Aqmar * Fix splits in Diacritization, POS and Segmentation datasets * Update GPT35 assets to new 'splits' API * Update GPT4 ZeroShot assets to new 'splits' API * Force alignment of test/train splits in fewshots mode * Miscellaneous fixes * Update GPT4 FewShot assets to new 'splits' API * Fix AraBench HuggingFace 
asset * Rename `Labeling` to `SequenceLabeling` * Fix labels in default data sample * Fix fewshot tests in Petals --- ...h_Ara2Eng_Helsinki_NLP_Opus_MT_ZeroShot.py | 80 +--- .../ar/MT/AraBench_ar2en_BLOOMZ_ZeroShot.py | 76 +--- assets/ar/MT/AraBench_ar2en_GPT35_ZeroShot.py | 76 +--- assets/ar/MT/AraBench_ar2en_GPT4_ZeroShot.py | 75 +--- assets/ar/QA/ARCD_BLOOMZ_ZeroShot.py | 1 - assets/ar/QA/ARCD_GPT35_ZeroShot.py | 1 - assets/ar/QA/ARCD_GPT4_FewShot.py | 4 - assets/ar/QA/ARCD_GPT4_ZeroShot.py | 1 - assets/ar/QA/MLQA_BLOOMZ_ZeroShot.py | 3 - assets/ar/QA/MLQA_GPT35_ZeroShot.py | 3 - assets/ar/QA/MLQA_GPT4_FewShot.py | 7 +- assets/ar/QA/MLQA_GPT4_ZeroShot.py | 3 - assets/ar/QA/TyDiQA_BLOOMZ_ZeroShot.py | 2 +- assets/ar/QA/TyDiQA_GPT35_ZeroShot.py | 2 +- assets/ar/QA/TyDiQA_GPT4_FewShot.py | 5 +- ...T4_ZeroShot.py => TyDiQA_GPT4_ZeroShot.py} | 2 +- assets/ar/QA/XQuAD_BLOOMZ_ZeroShot.py | 1 - assets/ar/QA/XQuAD_GPT35_ZeroShot.py | 1 - ..._GPT4_FewShot.py => XQuAD_GPT4_FewShot.py} | 4 - assets/ar/QA/XQuAD_GPT4_ZeroShot.py | 1 - .../gender/ArabGend_BLOOMZ_ZeroShot.py | 3 - .../gender/ArabGend_GPT35_ZeroShot.py | 3 - .../gender/ArabGend_GPT4_ZeroShot.py | 3 - .../gender/ArapTweet_BLOOMZ_ZeroShot.py | 3 - .../gender/ArapTweet_GPT35_ZeroShot.py | 3 - .../gender/ArapTweet_GPT4_FewShot.py | 6 - .../gender/ArapTweet_GPT4_ZeroShot.py | 3 - .../location/Location_BLOOMZ_ZeroShot.py | 3 - .../location/Location_GPT35_ZeroShot.py | 3 - .../location/Location_GPT4_FewShot.py | 2 - .../location/Location_GPT4_ZeroShot.py | 3 - .../name_info/NameInfo_BLOOMZ_ZeroShot.py | 3 - .../name_info/NameInfo_GPT35_ZeroShot.py | 3 - .../name_info/NameInfo_GPT4_FewShot.py | 2 - .../name_info/NameInfo_GPT4_ZeroShot.py | 3 - .../Adult_BLOOMZ_ZeroShot.py | 3 - .../Adult_GPT35_ZeroShot.py | 3 - .../Adult_GPT4_FewShot.py | 2 - .../Adult_GPT4_ZeroShot.py | 3 - .../CT22Attentionworthy_BLOOMZ_ZeroShot.py | 4 +- .../CT22Attentionworthy_GPT35_ZeroShot.py | 4 +- .../CT22Attentionworthy_GPT4_FewShot.py | 7 +- 
.../CT22Attentionworthy_GPT4_ZeroShot.py | 4 +- .../CT22Checkworthiness_BLOOMZ_ZeroShot.py | 4 +- .../CT22Checkworthiness_GPT35_ZeroShot.py | 4 +- .../CT22Checkworthiness_GPT4_FewShot.py | 11 +- .../CT22Checkworthiness_GPT4_ZeroShot.py | 8 +- .../CT22Claim_BLOOMZ_ZeroShot.py | 4 +- .../CT22Claim_GPT35_ZeroShot.py | 4 +- .../claim_detection/CT22Claim_GPT4_FewShot.py | 7 +- .../CT22Claim_GPT4_ZeroShot.py | 8 +- ...ot.py => ANSFactuality_BLOOMZ_ZeroShot.py} | 7 +- ...hot.py => ANSFactuality_GPT35_ZeroShot.py} | 7 +- ...wShot.py => ANSFactuality_GPT4_FewShot.py} | 10 +- ...Shot.py => ANSFactuality_GPT4_ZeroShot.py} | 7 +- .../COVID19Factuality_BLOOMZ_ZeroShot.py | 3 - .../COVID19Factuality_GPT35_ZeroShot.py | 3 - .../COVID19Factuality_GPT4_FewShot.py | 6 - .../COVID19Factuality_GPT4_ZeroShot.py | 3 - .../UnifiedFCFactuality_BLOOMZ_ZeroShot.py | 3 - .../UnifiedFCFactuality_GPT35_ZeroShot.py | 3 - .../UnifiedFCFactuality_GPT4_FewShot.py | 2 - .../UnifiedFCFactuality_GPT4_ZeroShot.py | 3 - .../CT22Harmful_BLOOMZ_ZeroShot.py | 4 +- .../CT22Harmful_GPT35_ZeroShot.py | 4 +- .../CT22Harmful_GPT4_FewShot.py | 7 +- .../CT22Harmful_GPT4_ZeroShot.py | 8 +- .../OSACT4SubtaskB_BLOOMZ_ZeroShot.py | 3 - .../OSACT4SubtaskB_GPT35_ZeroShot.py | 3 - .../OSACT4SubtaskB_GPT4_FewShot.py | 6 - .../OSACT4SubtaskB_GPT4_ZeroShot.py | 3 - .../OSACT4SubtaskA_BLOOMZ_ZeroShot.py | 3 - .../OSACT4SubtaskA_GPT35_ZeroShot.py | 3 - .../OSACT4SubtaskA_GPT4_FewShot.py | 6 - .../OSACT4SubtaskA_GPT4_ZeroShot.py | 3 - .../WANLP22Propaganda_BLOOMZ_ZeroShot.py | 3 - .../WANLP22Propaganda_GPT35_ZeroShot.py | 3 - .../WANLP22Propaganda_GPT4_FewShot.py | 6 - .../WANLP22Propaganda_GPT4_ZeroShot.py | 3 - .../spam/Spam_BLOOMZ_ZeroShot.py | 3 - .../spam/Spam_GPT35_ZeroShot.py | 3 - .../spam/Spam_GPT4_ZeroShot.py | 3 - .../CT23Subjectivity_BLOOMZ_ZeroShot.py | 4 +- .../CT23Subjectivity_GPT35_ZeroShot.py | 4 +- .../CT23Subjectivity_GPT4_FewShot.py | 7 +- .../CT23Subjectivity_GPT4_ZeroShot.py | 8 +- 
.../ASND_BLOOMZ_ZeroShot.py | 9 +- .../ASND_GPT35_ZeroShot.py | 7 +- .../news_categorization/ASND_GPT4_FewShot.py | 10 +- .../news_categorization/ASND_GPT4_ZeroShot.py | 7 +- ...t.py => SANADAkhbarona_BLOOMZ_ZeroShot.py} | 7 +- ...ot.py => SANADAkhbarona_GPT35_ZeroShot.py} | 7 +- ...Shot.py => SANADAkhbarona_GPT4_FewShot.py} | 10 +- ...hot.py => SANADAkhbarona_GPT4_ZeroShot.py} | 7 +- ...t.py => SANADAlArabiya_BLOOMZ_ZeroShot.py} | 7 +- ...ot.py => SANADAlArabiya_GPT35_ZeroShot.py} | 7 +- ...Shot.py => SANADAlArabiya_GPT4_FewShot.py} | 10 +- ...hot.py => SANADAlArabiya_GPT4_ZeroShot.py} | 7 +- ...t.py => SANADAlKhaleej_BLOOMZ_ZeroShot.py} | 7 +- ...ot.py => SANADAlKhaleej_GPT35_ZeroShot.py} | 7 +- ...Shot.py => SANADAlKhaleej_GPT4_FewShot.py} | 10 +- ...hot.py => SANADAlKhaleej_GPT4_ZeroShot.py} | 7 +- .../ar/semantics/NLI/XNLI_BLOOMZ_ZeroShot.py | 1 - .../ar/semantics/NLI/XNLI_GPT35_ZeroShot.py | 1 - assets/ar/semantics/NLI/XNLI_GPT4_FewShot.py | 5 +- assets/ar/semantics/NLI/XNLI_GPT4_ZeroShot.py | 1 - .../semantics/STS/Q2QSim_BLOOMZ_ZeroShot.py | 3 - .../ar/semantics/STS/Q2QSim_GPT35_ZeroShot.py | 3 - .../ar/semantics/STS/Q2QSim_GPT4_FewShot.py | 6 - .../ar/semantics/STS/Q2QSim_GPT4_ZeroShot.py | 3 - .../STS/SemEval17T1STS_BLOOMZ_ZeroShot.py | 6 - .../STS/SemEval17T1STS_GPT35_ZeroShot.py | 6 - .../STS/SemEval17T1STS_GPT4_FewShot.py | 9 - .../STS/SemEval17T1STS_GPT4_ZeroShot.py | 6 - .../STS/SemEval17T2STS_BLOOMZ_ZeroShot.py | 6 - .../STS/SemEval17T2STS_GPT35_ZeroShot.py | 6 - .../STS/SemEval17T2STS_GPT4_FewShot.py | 9 - .../STS/SemEval17T2STS_GPT4_ZeroShot.py | 6 - .../emotion/Emotion_BLOOMZ_ZeroShot.py | 3 - .../emotion/Emotion_GPT35_ZeroShot.py | 3 - .../emotion/Emotion_GPT4_FewShot.py | 6 - .../emotion/Emotion_GPT4_ZeroShot.py | 3 - ...oshot.py => ArSarcasm2_BLOOMZ_ZeroShot.py} | 7 +- .../sarcasm/ArSarcasm2_GPT35_ZeroShot.py | 7 +- .../sarcasm/ArSarcasm2_GPT4_FewShot.py | 10 +- .../sarcasm/ArSarcasm2_GPT4_ZeroShot.py | 7 +- 
.../sarcasm/ArSarcasm_BLOOMZ_ZeroShot.py | 3 - .../sarcasm/ArSarcasm_GPT35_ZeroShot.py | 3 - .../sarcasm/ArSarcasm_GPT4_FewShot.py | 6 - .../sarcasm/ArSarcasm_GPT4_ZeroShot.py | 3 - .../sentiment/ArSAS_BLOOMZ_ZeroShot.py | 3 - .../sentiment/ArSAS_GPT35_ZeroShot.py | 3 - .../sentiment/ArSAS_GPT4_FewShot.py | 6 - .../sentiment/ArSAS_GPT4_ZeroShot.py | 3 - ...roShot.py => ANSStance_BLOOMZ_ZeroShot.py} | 7 +- ...eroShot.py => ANSStance_GPT35_ZeroShot.py} | 7 +- ...4_FewShot.py => ANSStance_GPT4_FewShot.py} | 10 +- ...ZeroShot.py => ANSStance_GPT4_ZeroShot.py} | 7 +- .../UnifiedFCStance_BLOOMZ_ZeroShot.py | 3 - .../UnifiedFCStance_GPT35_ZeroShot.py | 3 - .../UnifiedFCStance_GPT4_FewShot.py | 6 - .../UnifiedFCStance_GPT4_ZeroShot.py | 3 - .../NER/ANERcorp_GPT35_ZeroShot.py | 3 - .../NER/ANERcorp_GPT4_FewShot.py | 6 - .../NER/ANERcorp_GPT4_ZeroShot.py | 3 - .../NER/Aqmar_GPT35_ZeroShot.py | 6 - .../NER/Aqmar_GPT4_FewShot.py | 13 +- .../NER/Aqmar_GPT4_ZeroShot.py | 6 - .../NER/MGBWords_GPT35_ZeroShot.py | 3 - .../NER/MGBWords_GPT4_ZeroShot.py | 3 - .../QCRIDialectalArabicPOS_GPT4_ZeroShot.py | 37 +- .../POS/QCRIDialectalArabic_GPT35_ZeroShot.py | 37 +- .../POS/QCRIDialectalArabic_GPT4_FewShot.py | 64 +--- .../POS/WikiNews_GPT35_ZeroShot.py | 3 - .../POS/WikiNews_GPT4_FewShot.py | 6 - .../POS/WikiNews_GPT4_ZeroShot.py | 3 - .../POS/XGLUE_GPT35_ZeroShot.py | 3 - .../POS/XGLUE_GPT4_FewShot.py | 7 +- .../POS/XGLUE_GPT4_ZeroShot.py | 3 - .../BibleMaghrebi_GPT35_ZeroShot.py | 36 +- .../BibleMaghrebi_GPT4_FewShot.py | 43 +-- .../BibleMaghrebi_GPT4_ZeroShot.py | 35 +- .../diacritization/WikiNews_GPT35_ZeroShot.py | 3 - .../diacritization/WikiNews_GPT4_FewShot.py | 6 - .../diacritization/WikiNews_GPT4_ZeroShot.py | 3 - .../ADI_BLOOMZ_ZeroShot.py | 3 - .../ADI_GPT35_ZeroShot.py | 3 - .../ADI_GPT4_FewShot.py | 6 +- .../ADI_GPT4_ZeroShot.py | 3 - .../QADI_BLOOMZ_ZeroShot.py | 3 - .../QADI_GPT35_ZeroShot.py | 3 - .../QADI_GPT4_ZeroShot.py | 3 - 
.../lemmatization/WikiNews_BLOOMZ_ZeroShot.py | 3 - .../lemmatization/WikiNews_GPT35_ZeroShot.py | 3 - .../lemmatization/WikiNews_GPT4_ZeroShot.py | 3 - .../parsing/PADT_GPT35_ZeroShot.py | 3 - .../parsing/PADT_GPT4_FewShot.py | 6 - .../parsing/PADT_GPT4_ZeroShot.py | 3 - .../QCRIDialectalArabic_GPT35_ZeroShot.py | 37 +- .../QCRIDialectalArabic_GPT4_FewShot.py | 51 +-- .../QCRIDialectalArabic_GPT4_ZeroShot.py | 38 +- .../segmentation/WikiNews_GPT35_ZeroShot.py | 3 - .../segmentation/WikiNews_GPT4_FewShot.py | 6 - .../segmentation/WikiNews_GPT4_ZeroShot.py | 3 - .../CT22Checkworthiness_BLOOMZ_ZeroShot.py | 4 +- .../CT22Checkworthiness_GPT4_FewShot.py | 7 +- .../CT22Checkworthiness_GPT4_ZeroShot.py | 4 +- .../BanglaSentiment_BLOOMZ_ZeroShot.py | 3 - .../sentiment/BanglaSentiment_GPT4_FewShot.py | 6 - .../BanglaSentiment_GPT4_ZeroShot.py | 3 - .../SemEval23T3Propaganda_BLOOMZ_ZeroShot.py | 4 +- .../SemEval23T3Propaganda_GPT4_FewShot.py | 6 +- .../SemEval23T3Propaganda_GPT4_ZeroShot.py | 4 +- .../CT22Checkworthiness_BLOOMZ_ZeroShot.py | 4 +- .../CT22Checkworthiness_GPT4_FewShot.py | 7 +- .../CT22Checkworthiness_GPT4_ZeroShot.py | 4 +- .../SemEval23T3Propaganda_BLOOMZ_ZeroShot.py | 4 +- .../SemEval23T3Propaganda_GPT4_FewShot.py | 6 +- .../SemEval23T3Propaganda_GPT4_ZeroShot.py | 4 +- .../CT22Checkworthiness_BLOOMZ_ZeroShot.py | 4 +- .../CT22Checkworthiness_GPT4_FewShot.py | 7 +- .../CT22Checkworthiness_GPT4_ZeroShot.py | 4 +- .../SemEval23T3Propaganda_BLOOMZ_ZeroShot.py | 4 +- .../SemEval23T3Propaganda_GPT4_FewShot.py | 6 +- .../SemEval23T3Propaganda_GPT4_ZeroShot.py | 4 +- .../SemEval23T3Propaganda_BLOOMZ_ZeroShot.py | 4 +- .../SemEval23T3Propaganda_GPT4_FewShot.py | 6 +- .../SemEval23T3Propaganda_GPT4_ZeroShot.py | 4 +- .../CT22Checkworthiness_BLOOMZ_ZeroShot.py | 4 +- .../CT22Checkworthiness_GPT4_FewShot.py | 7 +- .../CT22Checkworthiness_GPT4_ZeroShot.py | 4 +- .../SemEval23T3Propaganda_BLOOMZ_ZeroShot.py | 4 +- .../SemEval23T3Propaganda_GPT4_FewShot.py | 6 +- 
.../SemEval23T3Propaganda_GPT4_ZeroShot.py | 4 +- .../SemEval23T3Propaganda_BLOOMZ_ZeroShot.py | 4 +- .../SemEval23T3Propaganda_GPT4_FewShot.py | 6 +- .../SemEval23T3Propaganda_GPT4_ZeroShot.py | 4 +- .../CT22Checkworthiness_BLOOMZ_ZeroShot.py | 4 +- .../CT22Checkworthiness_GPT4_FewShot.py | 7 +- .../CT22Checkworthiness_GPT4_ZeroShot.py | 4 +- llmebench/benchmark.py | 258 +++++++------ llmebench/datasets/ADI.py | 27 +- llmebench/datasets/ANERcorp.py | 23 +- ...Khouja20Factuality.py => ANSFactuality.py} | 17 +- .../{Khouja20Stance.py => ANSStance.py} | 17 +- llmebench/datasets/ARCD.py | 9 + .../datasets/{NewsCatASND.py => ASND.py} | 30 +- llmebench/datasets/Adult.py | 13 +- llmebench/datasets/Aqmar.py | 29 +- llmebench/datasets/ArSAS.py | 14 +- llmebench/datasets/ArSarcasm.py | 13 +- llmebench/datasets/ArSarcasm2.py | 53 +++ llmebench/datasets/AraBench.py | 257 ++++++++++++- llmebench/datasets/ArabGend.py | 28 +- llmebench/datasets/ArapTweet.py | 11 +- llmebench/datasets/BanglaSentiment.py | 13 +- .../datasets/BibleMaghrebiDiacritization.py | 17 +- llmebench/datasets/COVID19Factuality.py | 12 +- llmebench/datasets/CT22Attentionworthy.py | 25 +- llmebench/datasets/CT22Checkworthiness.py | 35 +- llmebench/datasets/CT22Claim.py | 15 +- llmebench/datasets/CT22Harmful.py | 25 +- llmebench/datasets/CT23Subjectivity.py | 15 +- llmebench/datasets/Emotion.py | 26 +- llmebench/datasets/Location.py | 36 +- llmebench/datasets/MGBWords.py | 22 +- llmebench/datasets/MLQA.py | 11 +- llmebench/datasets/NameInfo.py | 113 +++++- llmebench/datasets/OSACT4SubtaskA.py | 14 +- llmebench/datasets/OSACT4SubtaskB.py | 14 +- llmebench/datasets/PADT.py | 11 +- llmebench/datasets/QADI.py | 33 +- llmebench/datasets/QCRIDialectalArabicPOS.py | 50 ++- .../QCRIDialectalArabicSegmentation.py | 28 +- ...{NewsCatAlKhaleej.py => SANADAkhbarona.py} | 27 +- ...{NewsCatAlArabiya.py => SANADAlArabiya.py} | 26 +- ...{NewsCatAkhbarona.py => SANADAlKhaleej.py} | 27 +- llmebench/datasets/SQuADBase.py | 3 +- 
llmebench/datasets/STSQ2Q.py | 12 +- llmebench/datasets/SemEval17T1STS.py | 15 +- llmebench/datasets/SemEval17T2STS.py | 15 +- llmebench/datasets/SemEval23T3Propaganda.py | 59 ++- llmebench/datasets/Spam.py | 11 +- llmebench/datasets/TyDiQA.py | 9 + llmebench/datasets/UnifiedFCFactuality.py | 13 +- llmebench/datasets/UnifiedFCStance.py | 13 +- llmebench/datasets/WANLP22Propaganda.py | 33 +- llmebench/datasets/WikiNewsDiacritization.py | 12 +- llmebench/datasets/WikiNewsLemmatization.py | 11 +- llmebench/datasets/WikiNewsPOS.py | 41 ++- llmebench/datasets/WikiNewsSegmentation.py | 13 +- llmebench/datasets/XGLUEPOS.py | 30 +- llmebench/datasets/XNLI.py | 13 +- llmebench/datasets/XQuAD.py | 9 + llmebench/datasets/__init__.py | 13 +- llmebench/datasets/dataset_base.py | 51 ++- llmebench/tasks/__init__.py | 15 + llmebench/utils.py | 84 +++++ scripts/find_dataset_metadata.py | 336 +++++++++++++++++ tests/datasets/test_download_and_caching.py | 69 ++-- tests/datasets/test_implementation.py | 18 + tests/datasets/test_metadata.py | 40 +- tests/models/test_HuggingFaceInferenceAPI.py | 4 +- tests/models/test_OpenAIModel.py | 4 +- tests/models/test_Petals.py | 4 +- tests/tasks/test_implementation.py | 9 +- tests/test_benchmark.py | 342 +++++++++++++++++- tests/test_benchmark_assets.py | 15 +- 288 files changed, 2734 insertions(+), 1618 deletions(-) rename assets/ar/QA/{TydiQA_GPT4_ZeroShot.py => TyDiQA_GPT4_ZeroShot.py} (92%) rename assets/ar/QA/{XQuaD_GPT4_FewShot.py => XQuAD_GPT4_FewShot.py} (90%) rename assets/ar/factuality_disinformation_harmful_content/factuality/{Khouja20Factuality_BLOOMZ_ZeroShot.py => ANSFactuality_BLOOMZ_ZeroShot.py} (84%) rename assets/ar/factuality_disinformation_harmful_content/factuality/{Khouja20Factuality_GPT35_ZeroShot.py => ANSFactuality_GPT35_ZeroShot.py} (80%) rename assets/ar/factuality_disinformation_harmful_content/factuality/{Khouja20Factuality_GPT4_FewShot.py => ANSFactuality_GPT4_FewShot.py} (81%) rename 
assets/ar/factuality_disinformation_harmful_content/factuality/{Khouja20Factuality_GPT4_ZeroShot.py => ANSFactuality_GPT4_ZeroShot.py} (83%) rename assets/ar/news_categorization/{Akhbarona_BLOOMZ_ZeroShot.py => SANADAkhbarona_BLOOMZ_ZeroShot.py} (91%) rename assets/ar/news_categorization/{Akhbarona_GPT35_ZeroShot.py => SANADAkhbarona_GPT35_ZeroShot.py} (89%) rename assets/ar/news_categorization/{Akhbarona_GPT4_FewShot.py => SANADAkhbarona_GPT4_FewShot.py} (87%) rename assets/ar/news_categorization/{Akhbarona_GPT4_ZeroShot.py => SANADAkhbarona_GPT4_ZeroShot.py} (89%) rename assets/ar/news_categorization/{AlArabiya_BLOOMZ_ZeroShot.py => SANADAlArabiya_BLOOMZ_ZeroShot.py} (90%) rename assets/ar/news_categorization/{AlArabiya_GPT35_ZeroShot.py => SANADAlArabiya_GPT35_ZeroShot.py} (88%) rename assets/ar/news_categorization/{AlArabiya_GPT4_FewShot.py => SANADAlArabiya_GPT4_FewShot.py} (87%) rename assets/ar/news_categorization/{AlArabiya_GPT4_ZeroShot.py => SANADAlArabiya_GPT4_ZeroShot.py} (89%) rename assets/ar/news_categorization/{AlKhaleej_BLOOMZ_ZeroShot.py => SANADAlKhaleej_BLOOMZ_ZeroShot.py} (90%) rename assets/ar/news_categorization/{AlKhaleej_GPT35_ZeroShot.py => SANADAlKhaleej_GPT35_ZeroShot.py} (88%) rename assets/ar/news_categorization/{AlKhaleej_GPT4_FewShot.py => SANADAlKhaleej_GPT4_FewShot.py} (87%) rename assets/ar/news_categorization/{AlKhaleej_GPT4_ZeroShot.py => SANADAlKhaleej_GPT4_ZeroShot.py} (89%) rename assets/ar/sentiment_emotion_others/sarcasm/{ArSarcasm2_BLOOMZ_Zeroshot.py => ArSarcasm2_BLOOMZ_ZeroShot.py} (81%) rename assets/ar/sentiment_emotion_others/stance_detection/{Khouja20Stance_BLOOMZ_ZeroShot.py => ANSStance_BLOOMZ_ZeroShot.py} (76%) rename assets/ar/sentiment_emotion_others/stance_detection/{Khouja20Stance_GPT35_ZeroShot.py => ANSStance_GPT35_ZeroShot.py} (79%) rename assets/ar/sentiment_emotion_others/stance_detection/{Khouja20Stance_GPT4_FewShot.py => ANSStance_GPT4_FewShot.py} (83%) rename 
assets/ar/sentiment_emotion_others/stance_detection/{Khouja20Stance_GPT4_ZeroShot.py => ANSStance_GPT4_ZeroShot.py} (85%) rename llmebench/datasets/{Khouja20Factuality.py => ANSFactuality.py} (78%) rename llmebench/datasets/{Khouja20Stance.py => ANSStance.py} (78%) rename llmebench/datasets/{NewsCatASND.py => ASND.py} (58%) create mode 100644 llmebench/datasets/ArSarcasm2.py rename llmebench/datasets/{NewsCatAlKhaleej.py => SANADAkhbarona.py} (57%) rename llmebench/datasets/{NewsCatAlArabiya.py => SANADAlArabiya.py} (58%) rename llmebench/datasets/{NewsCatAkhbarona.py => SANADAlKhaleej.py} (57%) create mode 100644 scripts/find_dataset_metadata.py diff --git a/assets/ar/MT/AraBench_Ara2Eng_Helsinki_NLP_Opus_MT_ZeroShot.py b/assets/ar/MT/AraBench_Ara2Eng_Helsinki_NLP_Opus_MT_ZeroShot.py index effb3ba5..9560fd36 100644 --- a/assets/ar/MT/AraBench_Ara2Eng_Helsinki_NLP_Opus_MT_ZeroShot.py +++ b/assets/ar/MT/AraBench_Ara2Eng_Helsinki_NLP_Opus_MT_ZeroShot.py @@ -4,71 +4,21 @@ def config(): - sets = [ - "bible.test.mgr.0.ma", - "bible.test.mgr.0.tn", - "bible.test.msa.0.ms", - "bible.test.msa.1.ms", - "ldc_web_eg.test.lev.0.jo", - "ldc_web_eg.test.lev.0.ps", - "ldc_web_eg.test.lev.0.sy", - "ldc_web_eg.test.mgr.0.tn", - "ldc_web_eg.test.msa.0.ms", - "ldc_web_eg.test.nil.0.eg", - "ldc_web_lv.test.lev.0.lv", - "madar.test.glf.0.iq", - "madar.test.glf.0.om", - "madar.test.glf.0.qa", - "madar.test.glf.0.sa", - "madar.test.glf.0.ye", - "madar.test.glf.1.iq", - "madar.test.glf.1.sa", - "madar.test.glf.2.iq", - "madar.test.lev.0.jo", - "madar.test.lev.0.lb", - "madar.test.lev.0.pa", - "madar.test.lev.0.sy", - "madar.test.lev.1.jo", - "madar.test.lev.1.sy", - "madar.test.mgr.0.dz", - "madar.test.mgr.0.ly", - "madar.test.mgr.0.ma", - "madar.test.mgr.0.tn", - "madar.test.mgr.1.ly", - "madar.test.mgr.1.ma", - "madar.test.mgr.1.tn", - "madar.test.msa.0.ms", - "madar.test.nil.0.eg", - "madar.test.nil.0.sd", - "madar.test.nil.1.eg", - "madar.test.nil.2.eg", - ] - - configs = [] - for 
testset in sets: - configs.append( - { - "name": testset, - "config": { - "dataset": AraBenchDataset, - "dataset_args": { - "src": f"{testset}.ar", - "tgt": f"{testset}.en", - }, - "task": MachineTranslationTask, - "task_args": {}, - "model": HuggingFaceInferenceAPIModel, - "model_args": { - "task_type": HuggingFaceTaskTypes.Translation, - "inference_api_url": "https://api-inference.huggingface.co/models/Helsinki-NLP/opus-mt-ar-en", - "max_tries": 5, - }, - "general_args": {"data_path": "data/MT/"}, - }, - } - ) - - return configs + return { + "dataset": AraBenchDataset, + "dataset_args": { + "src_lang": "ar", + "tgt_lang": "en", + }, + "task": MachineTranslationTask, + "task_args": {}, + "model": HuggingFaceInferenceAPIModel, + "model_args": { + "task_type": HuggingFaceTaskTypes.Translation, + "inference_api_url": "https://api-inference.huggingface.co/models/Helsinki-NLP/opus-mt-ar-en", + "max_tries": 5, + }, + } def prompt(input_sample): diff --git a/assets/ar/MT/AraBench_ar2en_BLOOMZ_ZeroShot.py b/assets/ar/MT/AraBench_ar2en_BLOOMZ_ZeroShot.py index 756c5ffc..57ae9c89 100644 --- a/assets/ar/MT/AraBench_ar2en_BLOOMZ_ZeroShot.py +++ b/assets/ar/MT/AraBench_ar2en_BLOOMZ_ZeroShot.py @@ -4,69 +4,19 @@ def config(): - sets = [ - "bible.test.mgr.0.ma", - "bible.test.mgr.0.tn", - "bible.test.msa.0.ms", - "bible.test.msa.1.ms", - "ldc_web_eg.test.lev.0.jo", - "ldc_web_eg.test.lev.0.ps", - "ldc_web_eg.test.lev.0.sy", - "ldc_web_eg.test.mgr.0.tn", - "ldc_web_eg.test.msa.0.ms", - "ldc_web_eg.test.nil.0.eg", - "ldc_web_lv.test.lev.0.lv", - "madar.test.glf.0.iq", - "madar.test.glf.0.om", - "madar.test.glf.0.qa", - "madar.test.glf.0.sa", - "madar.test.glf.0.ye", - "madar.test.glf.1.iq", - "madar.test.glf.1.sa", - "madar.test.glf.2.iq", - "madar.test.lev.0.jo", - "madar.test.lev.0.lb", - "madar.test.lev.0.pa", - "madar.test.lev.0.sy", - "madar.test.lev.1.jo", - "madar.test.lev.1.sy", - "madar.test.mgr.0.dz", - "madar.test.mgr.0.ly", - "madar.test.mgr.0.ma", - 
"madar.test.mgr.0.tn", - "madar.test.mgr.1.ly", - "madar.test.mgr.1.ma", - "madar.test.mgr.1.tn", - "madar.test.msa.0.ms", - "madar.test.nil.0.eg", - "madar.test.nil.0.sd", - "madar.test.nil.1.eg", - "madar.test.nil.2.eg", - ] - - configs = [] - for testset in sets: - configs.append( - { - "name": testset, - "config": { - "dataset": AraBenchDataset, - "dataset_args": { - "src": f"{testset}.ar", - "tgt": f"{testset}.en", - }, - "task": MachineTranslationTask, - "task_args": {}, - "model": PetalsModel, - "model_args": { - "max_tries": 3, - }, - "general_args": {"data_path": "data/MT/"}, - }, - } - ) - - return configs + return { + "dataset": AraBenchDataset, + "dataset_args": { + "src_lang": "ar", + "tgt_lang": "en", + }, + "task": MachineTranslationTask, + "task_args": {}, + "model": PetalsModel, + "model_args": { + "max_tries": 3, + }, + } def prompt(input_sample): diff --git a/assets/ar/MT/AraBench_ar2en_GPT35_ZeroShot.py b/assets/ar/MT/AraBench_ar2en_GPT35_ZeroShot.py index 85b7ed3a..ed5e4459 100644 --- a/assets/ar/MT/AraBench_ar2en_GPT35_ZeroShot.py +++ b/assets/ar/MT/AraBench_ar2en_GPT35_ZeroShot.py @@ -4,69 +4,19 @@ def config(): - sets = [ - "bible.test.mgr.0.ma", - "bible.test.mgr.0.tn", - "bible.test.msa.0.ms", - "bible.test.msa.1.ms", - "ldc_web_eg.test.lev.0.jo", - "ldc_web_eg.test.lev.0.ps", - "ldc_web_eg.test.lev.0.sy", - "ldc_web_eg.test.mgr.0.tn", - "ldc_web_eg.test.msa.0.ms", - "ldc_web_eg.test.nil.0.eg", - "ldc_web_lv.test.lev.0.lv", - "madar.test.glf.0.iq", - "madar.test.glf.0.om", - "madar.test.glf.0.qa", - "madar.test.glf.0.sa", - "madar.test.glf.0.ye", - "madar.test.glf.1.iq", - "madar.test.glf.1.sa", - "madar.test.glf.2.iq", - "madar.test.lev.0.jo", - "madar.test.lev.0.lb", - "madar.test.lev.0.pa", - "madar.test.lev.0.sy", - "madar.test.lev.1.jo", - "madar.test.lev.1.sy", - "madar.test.mgr.0.dz", - "madar.test.mgr.0.ly", - "madar.test.mgr.0.ma", - "madar.test.mgr.0.tn", - "madar.test.mgr.1.ly", - "madar.test.mgr.1.ma", - "madar.test.mgr.1.tn", 
- "madar.test.msa.0.ms", - "madar.test.nil.0.eg", - "madar.test.nil.0.sd", - "madar.test.nil.1.eg", - "madar.test.nil.2.eg", - ] - - configs = [] - for testset in sets: - configs.append( - { - "name": testset, - "config": { - "dataset": AraBenchDataset, - "dataset_args": { - "src": f"{testset}.ar", - "tgt": f"{testset}.en", - }, - "task": MachineTranslationTask, - "task_args": {}, - "model": LegacyOpenAIModel, - "model_args": { - "max_tries": 5, - }, - "general_args": {"data_path": "data/MT/"}, - }, - } - ) - - return configs + return { + "dataset": AraBenchDataset, + "dataset_args": { + "src_lang": "ar", + "tgt_lang": "en", + }, + "task": MachineTranslationTask, + "task_args": {}, + "model": LegacyOpenAIModel, + "model_args": { + "max_tries": 5, + }, + } def prompt(input_sample): diff --git a/assets/ar/MT/AraBench_ar2en_GPT4_ZeroShot.py b/assets/ar/MT/AraBench_ar2en_GPT4_ZeroShot.py index 84879af8..4f5b4006 100644 --- a/assets/ar/MT/AraBench_ar2en_GPT4_ZeroShot.py +++ b/assets/ar/MT/AraBench_ar2en_GPT4_ZeroShot.py @@ -4,68 +4,19 @@ def config(): - sets = [ - "bible.test.mgr.0.ma", - "bible.test.mgr.0.tn", - "bible.test.msa.0.ms", - "bible.test.msa.1.ms", - "ldc_web_eg.test.lev.0.jo", - "ldc_web_eg.test.lev.0.ps", - "ldc_web_eg.test.lev.0.sy", - "ldc_web_eg.test.mgr.0.tn", - "ldc_web_eg.test.msa.0.ms", - "ldc_web_eg.test.nil.0.eg", - "ldc_web_lv.test.lev.0.lv", - "madar.test.glf.0.iq", - "madar.test.glf.0.om", - "madar.test.glf.0.qa", - "madar.test.glf.0.sa", - "madar.test.glf.0.ye", - "madar.test.glf.1.iq", - "madar.test.glf.1.sa", - "madar.test.glf.2.iq", - "madar.test.lev.0.jo", - "madar.test.lev.0.lb", - "madar.test.lev.0.pa", - "madar.test.lev.0.sy", - "madar.test.lev.1.jo", - "madar.test.lev.1.sy", - "madar.test.mgr.0.dz", - "madar.test.mgr.0.ly", - "madar.test.mgr.0.ma", - "madar.test.mgr.0.tn", - "madar.test.mgr.1.ly", - "madar.test.mgr.1.ma", - "madar.test.mgr.1.tn", - "madar.test.msa.0.ms", - "madar.test.nil.0.eg", - "madar.test.nil.0.sd", - 
"madar.test.nil.1.eg", - "madar.test.nil.2.eg", - ] - configs = [] - for testset in sets: - configs.append( - { - "name": testset, - "config": { - "dataset": AraBenchDataset, - "dataset_args": { - "src": f"{testset}.ar", - "tgt": f"{testset}.en", - }, - "task": MachineTranslationTask, - "task_args": {}, - "model": OpenAIModel, - "model_args": { - "max_tries": 5, - }, - "general_args": {"data_path": "data/MT/"}, - }, - } - ) - - return configs + return { + "dataset": AraBenchDataset, + "dataset_args": { + "src_lang": "ar", + "tgt_lang": "en", + }, + "task": MachineTranslationTask, + "task_args": {}, + "model": OpenAIModel, + "model_args": { + "max_tries": 5, + }, + } def prompt(input_sample): diff --git a/assets/ar/QA/ARCD_BLOOMZ_ZeroShot.py b/assets/ar/QA/ARCD_BLOOMZ_ZeroShot.py index a7296804..cbac6661 100644 --- a/assets/ar/QA/ARCD_BLOOMZ_ZeroShot.py +++ b/assets/ar/QA/ARCD_BLOOMZ_ZeroShot.py @@ -13,7 +13,6 @@ def config(): "model_args": { "max_tries": 5, }, - "general_args": {"data_path": "data/QA/ARCD/arcd-test.json"}, } diff --git a/assets/ar/QA/ARCD_GPT35_ZeroShot.py b/assets/ar/QA/ARCD_GPT35_ZeroShot.py index 94f9789a..dbe6577e 100644 --- a/assets/ar/QA/ARCD_GPT35_ZeroShot.py +++ b/assets/ar/QA/ARCD_GPT35_ZeroShot.py @@ -13,7 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": {"data_path": "data/QA/ARCD/arcd-test.json"}, } diff --git a/assets/ar/QA/ARCD_GPT4_FewShot.py b/assets/ar/QA/ARCD_GPT4_FewShot.py index 4f966c59..509a771c 100644 --- a/assets/ar/QA/ARCD_GPT4_FewShot.py +++ b/assets/ar/QA/ARCD_GPT4_FewShot.py @@ -17,10 +17,6 @@ def config(): "model_args": { "max_tries": 30, }, - "general_args": { - "data_path": "data/QA/arcd/arcd-test.json", - "fewshot": {"train_data_path": "data/QA/arcd/arcd-train.json"}, - }, } diff --git a/assets/ar/QA/ARCD_GPT4_ZeroShot.py b/assets/ar/QA/ARCD_GPT4_ZeroShot.py index 664c581a..8ec2ac4e 100644 --- a/assets/ar/QA/ARCD_GPT4_ZeroShot.py +++ b/assets/ar/QA/ARCD_GPT4_ZeroShot.py @@ -13,7 +13,6 @@ 
def config(): "model_args": { "max_tries": 50, }, - "general_args": {"data_path": "data/QA/arcd/arcd-test.json"}, } diff --git a/assets/ar/QA/MLQA_BLOOMZ_ZeroShot.py b/assets/ar/QA/MLQA_BLOOMZ_ZeroShot.py index 5e4ea988..8a97707b 100644 --- a/assets/ar/QA/MLQA_BLOOMZ_ZeroShot.py +++ b/assets/ar/QA/MLQA_BLOOMZ_ZeroShot.py @@ -13,9 +13,6 @@ def config(): "model_args": { "max_tries": 5, }, - "general_args": { - "data_path": "data/QA/MLQA/test/test-context-ar-question-ar.json" - }, } diff --git a/assets/ar/QA/MLQA_GPT35_ZeroShot.py b/assets/ar/QA/MLQA_GPT35_ZeroShot.py index a019acc9..e2d08c66 100644 --- a/assets/ar/QA/MLQA_GPT35_ZeroShot.py +++ b/assets/ar/QA/MLQA_GPT35_ZeroShot.py @@ -13,9 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/QA/MLQA/test/test-context-ar-question-ar.json" - }, } diff --git a/assets/ar/QA/MLQA_GPT4_FewShot.py b/assets/ar/QA/MLQA_GPT4_FewShot.py index 13eefdc1..33af361f 100644 --- a/assets/ar/QA/MLQA_GPT4_FewShot.py +++ b/assets/ar/QA/MLQA_GPT4_FewShot.py @@ -17,12 +17,7 @@ def config(): "model_args": { "max_tries": 30, }, - "general_args": { - "data_path": "data/QA/MLQA/test/test-context-ar-question-ar.json", - "fewshot": { - "train_data_path": "data/QA/MLQA/dev/dev-context-ar-question-ar.json" - }, - }, + "general_args": {"fewshot": {"train_split": "dev"}}, } diff --git a/assets/ar/QA/MLQA_GPT4_ZeroShot.py b/assets/ar/QA/MLQA_GPT4_ZeroShot.py index 813c5f5f..6c5942d7 100644 --- a/assets/ar/QA/MLQA_GPT4_ZeroShot.py +++ b/assets/ar/QA/MLQA_GPT4_ZeroShot.py @@ -13,9 +13,6 @@ def config(): "model_args": { "max_tries": 50, }, - "general_args": { - "data_path": "data/QA/MLQA/test/test-context-ar-question-ar.json" - }, } diff --git a/assets/ar/QA/TyDiQA_BLOOMZ_ZeroShot.py b/assets/ar/QA/TyDiQA_BLOOMZ_ZeroShot.py index 4a065a6e..f3649135 100644 --- a/assets/ar/QA/TyDiQA_BLOOMZ_ZeroShot.py +++ b/assets/ar/QA/TyDiQA_BLOOMZ_ZeroShot.py @@ -13,7 +13,7 @@ def config(): "model_args": { "max_tries": 5, 
}, - "general_args": {"data_path": "data/QA/tydiqa/tydiqa-goldp-dev-arabic.json"}, + "general_args": {"test_split": "dev"}, } diff --git a/assets/ar/QA/TyDiQA_GPT35_ZeroShot.py b/assets/ar/QA/TyDiQA_GPT35_ZeroShot.py index 1ee71123..e8703ebf 100644 --- a/assets/ar/QA/TyDiQA_GPT35_ZeroShot.py +++ b/assets/ar/QA/TyDiQA_GPT35_ZeroShot.py @@ -13,7 +13,7 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": {"data_path": "data/QA/tydiqa/tydiqa-goldp-dev-arabic.json"}, + "general_args": {"test_split": "dev"}, } diff --git a/assets/ar/QA/TyDiQA_GPT4_FewShot.py b/assets/ar/QA/TyDiQA_GPT4_FewShot.py index 464d2f32..5e01566b 100644 --- a/assets/ar/QA/TyDiQA_GPT4_FewShot.py +++ b/assets/ar/QA/TyDiQA_GPT4_FewShot.py @@ -17,10 +17,7 @@ def config(): "model_args": { "max_tries": 30, }, - "general_args": { - "data_path": "data/QA/tydiqa/tydiqa-goldp-dev-arabic.json", - "fewshot": {"train_data_path": "data/QA/arcd/arcd-train.json"}, - }, + "general_args": {"test_split": "dev"}, } diff --git a/assets/ar/QA/TydiQA_GPT4_ZeroShot.py b/assets/ar/QA/TyDiQA_GPT4_ZeroShot.py similarity index 92% rename from assets/ar/QA/TydiQA_GPT4_ZeroShot.py rename to assets/ar/QA/TyDiQA_GPT4_ZeroShot.py index 245da513..1a9d9a9a 100644 --- a/assets/ar/QA/TydiQA_GPT4_ZeroShot.py +++ b/assets/ar/QA/TyDiQA_GPT4_ZeroShot.py @@ -13,7 +13,7 @@ def config(): "model_args": { "max_tries": 50, }, - "general_args": {"data_path": "data/QA/tydiqa/tydiqa-goldp-dev-arabic.json"}, + "general_args": {"test_split": "dev"}, } diff --git a/assets/ar/QA/XQuAD_BLOOMZ_ZeroShot.py b/assets/ar/QA/XQuAD_BLOOMZ_ZeroShot.py index 9f6c76fc..4be84c72 100644 --- a/assets/ar/QA/XQuAD_BLOOMZ_ZeroShot.py +++ b/assets/ar/QA/XQuAD_BLOOMZ_ZeroShot.py @@ -13,7 +13,6 @@ def config(): "model_args": { "max_tries": 5, }, - "general_args": {"data_path": "data/QA/xquad/xquad.ar.json"}, } diff --git a/assets/ar/QA/XQuAD_GPT35_ZeroShot.py b/assets/ar/QA/XQuAD_GPT35_ZeroShot.py index 872aa352..dfeb6dd4 100644 --- 
a/assets/ar/QA/XQuAD_GPT35_ZeroShot.py +++ b/assets/ar/QA/XQuAD_GPT35_ZeroShot.py @@ -13,7 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": {"data_path": "data/QA/xquad/xquad.ar.json"}, } diff --git a/assets/ar/QA/XQuaD_GPT4_FewShot.py b/assets/ar/QA/XQuAD_GPT4_FewShot.py similarity index 90% rename from assets/ar/QA/XQuaD_GPT4_FewShot.py rename to assets/ar/QA/XQuAD_GPT4_FewShot.py index fdcab455..ba3c0f0a 100644 --- a/assets/ar/QA/XQuaD_GPT4_FewShot.py +++ b/assets/ar/QA/XQuAD_GPT4_FewShot.py @@ -17,10 +17,6 @@ def config(): "model_args": { "max_tries": 30, }, - "general_args": { - "data_path": "data/QA/xquad/xquad.ar.json", - "fewshot": {"train_data_path": "data/QA/arcd/arcd-train.json"}, - }, } diff --git a/assets/ar/QA/XQuAD_GPT4_ZeroShot.py b/assets/ar/QA/XQuAD_GPT4_ZeroShot.py index bb83df4f..bd9a7e92 100644 --- a/assets/ar/QA/XQuAD_GPT4_ZeroShot.py +++ b/assets/ar/QA/XQuAD_GPT4_ZeroShot.py @@ -13,7 +13,6 @@ def config(): "model_args": { "max_tries": 50, }, - "general_args": {"data_path": "data/QA/xquad/xquad.ar.json"}, } diff --git a/assets/ar/demographic_attributes/gender/ArabGend_BLOOMZ_ZeroShot.py b/assets/ar/demographic_attributes/gender/ArabGend_BLOOMZ_ZeroShot.py index 6bd166fd..cbc10fc1 100644 --- a/assets/ar/demographic_attributes/gender/ArabGend_BLOOMZ_ZeroShot.py +++ b/assets/ar/demographic_attributes/gender/ArabGend_BLOOMZ_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["m", "f"], "max_tries": 3, }, - "general_args": { - "data_path": "data/demographic_attributes/gender/gender-test.txt" - }, } diff --git a/assets/ar/demographic_attributes/gender/ArabGend_GPT35_ZeroShot.py b/assets/ar/demographic_attributes/gender/ArabGend_GPT35_ZeroShot.py index 4efca52b..c0211a52 100644 --- a/assets/ar/demographic_attributes/gender/ArabGend_GPT35_ZeroShot.py +++ b/assets/ar/demographic_attributes/gender/ArabGend_GPT35_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["m", "f"], "max_tries": 3, }, - 
"general_args": { - "data_path": "data/demographic_attributes/gender/gender-test.txt" - }, } diff --git a/assets/ar/demographic_attributes/gender/ArabGend_GPT4_ZeroShot.py b/assets/ar/demographic_attributes/gender/ArabGend_GPT4_ZeroShot.py index 2aa750b3..8b280726 100644 --- a/assets/ar/demographic_attributes/gender/ArabGend_GPT4_ZeroShot.py +++ b/assets/ar/demographic_attributes/gender/ArabGend_GPT4_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["m", "f"], "max_tries": 3, }, - "general_args": { - "data_path": "data/demographic_attributes/gender/gender-test.txt" - }, } diff --git a/assets/ar/demographic_attributes/gender/ArapTweet_BLOOMZ_ZeroShot.py b/assets/ar/demographic_attributes/gender/ArapTweet_BLOOMZ_ZeroShot.py index 6041f819..2a19f636 100644 --- a/assets/ar/demographic_attributes/gender/ArapTweet_BLOOMZ_ZeroShot.py +++ b/assets/ar/demographic_attributes/gender/ArapTweet_BLOOMZ_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["Female", "Male"], "max_tries": 3, }, - "general_args": { - "data_path": "data/demographic_attributes/gender/test-ARAP-unique.txt" - }, } diff --git a/assets/ar/demographic_attributes/gender/ArapTweet_GPT35_ZeroShot.py b/assets/ar/demographic_attributes/gender/ArapTweet_GPT35_ZeroShot.py index a2aa412b..c5998e7b 100644 --- a/assets/ar/demographic_attributes/gender/ArapTweet_GPT35_ZeroShot.py +++ b/assets/ar/demographic_attributes/gender/ArapTweet_GPT35_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["Female", "Male"], "max_tries": 20, }, - "general_args": { - "data_path": "data/demographic_attributes/gender/test-ARAP-unique.txt" - }, } diff --git a/assets/ar/demographic_attributes/gender/ArapTweet_GPT4_FewShot.py b/assets/ar/demographic_attributes/gender/ArapTweet_GPT4_FewShot.py index f925c209..84794a04 100644 --- a/assets/ar/demographic_attributes/gender/ArapTweet_GPT4_FewShot.py +++ b/assets/ar/demographic_attributes/gender/ArapTweet_GPT4_FewShot.py @@ -14,12 +14,6 @@ def config(): 
"class_labels": ["Female", "Male"], "max_tries": 30, }, - "general_args": { - "data_path": "data/demographic_attributes/gender/test-ARAP-unique.txt", - "fewshot": { - "train_data_path": "data/demographic_attributes/gender/train-wajdi.tsv", - }, - }, } diff --git a/assets/ar/demographic_attributes/gender/ArapTweet_GPT4_ZeroShot.py b/assets/ar/demographic_attributes/gender/ArapTweet_GPT4_ZeroShot.py index 26bddddb..fcc9a789 100644 --- a/assets/ar/demographic_attributes/gender/ArapTweet_GPT4_ZeroShot.py +++ b/assets/ar/demographic_attributes/gender/ArapTweet_GPT4_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["Female", "Male"], "max_tries": 30, }, - "general_args": { - "data_path": "data/demographic_attributes/gender/test-ARAP-unique.txt" - }, } diff --git a/assets/ar/demographic_attributes/location/Location_BLOOMZ_ZeroShot.py b/assets/ar/demographic_attributes/location/Location_BLOOMZ_ZeroShot.py index 0675476d..795c72ba 100644 --- a/assets/ar/demographic_attributes/location/Location_BLOOMZ_ZeroShot.py +++ b/assets/ar/demographic_attributes/location/Location_BLOOMZ_ZeroShot.py @@ -37,9 +37,6 @@ def config(): ], "max_tries": 2, }, - "general_args": { - "data_path": "data/demographic_attributes/location/arab+others.txt" - }, } diff --git a/assets/ar/demographic_attributes/location/Location_GPT35_ZeroShot.py b/assets/ar/demographic_attributes/location/Location_GPT35_ZeroShot.py index 9c652517..b3d76ecc 100644 --- a/assets/ar/demographic_attributes/location/Location_GPT35_ZeroShot.py +++ b/assets/ar/demographic_attributes/location/Location_GPT35_ZeroShot.py @@ -37,9 +37,6 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/demographic_attributes/location/arab+others.txt" - }, } diff --git a/assets/ar/demographic_attributes/location/Location_GPT4_FewShot.py b/assets/ar/demographic_attributes/location/Location_GPT4_FewShot.py index 536814c6..87a4a00e 100644 --- a/assets/ar/demographic_attributes/location/Location_GPT4_FewShot.py 
+++ b/assets/ar/demographic_attributes/location/Location_GPT4_FewShot.py @@ -38,9 +38,7 @@ def config(): "max_tries": 30, }, "general_args": { - "data_path": "data/demographic_attributes/location/arab+others.txt", "fewshot": { - "train_data_path": "data/demographic_attributes/location/arab+others.txt", # TODO need to change the file "deduplicate": False, }, }, diff --git a/assets/ar/demographic_attributes/location/Location_GPT4_ZeroShot.py b/assets/ar/demographic_attributes/location/Location_GPT4_ZeroShot.py index 6c5fbef2..174ac7c7 100644 --- a/assets/ar/demographic_attributes/location/Location_GPT4_ZeroShot.py +++ b/assets/ar/demographic_attributes/location/Location_GPT4_ZeroShot.py @@ -37,9 +37,6 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/demographic_attributes/location/arab+others.txt" - }, } diff --git a/assets/ar/demographic_attributes/name_info/NameInfo_BLOOMZ_ZeroShot.py b/assets/ar/demographic_attributes/name_info/NameInfo_BLOOMZ_ZeroShot.py index 3e8df84a..de3fca74 100644 --- a/assets/ar/demographic_attributes/name_info/NameInfo_BLOOMZ_ZeroShot.py +++ b/assets/ar/demographic_attributes/name_info/NameInfo_BLOOMZ_ZeroShot.py @@ -117,9 +117,6 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/demographic_attributes/name_info/wikidata_test.txt" - }, } diff --git a/assets/ar/demographic_attributes/name_info/NameInfo_GPT35_ZeroShot.py b/assets/ar/demographic_attributes/name_info/NameInfo_GPT35_ZeroShot.py index 12ae9484..ccc19c41 100644 --- a/assets/ar/demographic_attributes/name_info/NameInfo_GPT35_ZeroShot.py +++ b/assets/ar/demographic_attributes/name_info/NameInfo_GPT35_ZeroShot.py @@ -115,9 +115,6 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/demographic_attributes/name_info/wikidata_test.txt", - }, } diff --git a/assets/ar/demographic_attributes/name_info/NameInfo_GPT4_FewShot.py b/assets/ar/demographic_attributes/name_info/NameInfo_GPT4_FewShot.py index 
168c3e1b..428149c0 100644 --- a/assets/ar/demographic_attributes/name_info/NameInfo_GPT4_FewShot.py +++ b/assets/ar/demographic_attributes/name_info/NameInfo_GPT4_FewShot.py @@ -116,9 +116,7 @@ def config(): "max_tries": 30, }, "general_args": { - "data_path": "data/demographic_attributes/name_info/wikidata_test.txt", "fewshot": { - "train_data_path": "data/demographic_attributes/name_info/wikidata_test.txt", # TODO need to change the file "deduplicate": False, }, }, diff --git a/assets/ar/demographic_attributes/name_info/NameInfo_GPT4_ZeroShot.py b/assets/ar/demographic_attributes/name_info/NameInfo_GPT4_ZeroShot.py index 64069d19..6290168a 100644 --- a/assets/ar/demographic_attributes/name_info/NameInfo_GPT4_ZeroShot.py +++ b/assets/ar/demographic_attributes/name_info/NameInfo_GPT4_ZeroShot.py @@ -115,9 +115,6 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/demographic_attributes/name_info/wikidata_test.txt" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_BLOOMZ_ZeroShot.py index 2e9c7927..be1b6ccb 100644 --- a/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_BLOOMZ_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["ADULT", "NOT_ADULT"], "max_tries": 10, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/adult/adult-test.tsv", - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT35_ZeroShot.py index ad7b3a06..506dc5de 100644 --- a/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT35_ZeroShot.py +++ 
b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT35_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["ADULT", "NOT_ADULT"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/adult/adult-test.tsv" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT4_FewShot.py index 16e9fbf7..d9957f51 100644 --- a/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT4_FewShot.py @@ -15,9 +15,7 @@ def config(): "max_tries": 30, }, "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/adult/adult-test.tsv", "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/adult/adult-train.tsv", "deduplicate": True, }, }, diff --git a/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT4_ZeroShot.py index 3fbd08b2..73c59c32 100644 --- a/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_GPT4_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["ADULT", "NOT_ADULT"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/adult/adult-test.tsv" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_BLOOMZ_ZeroShot.py index 6b83efc3..6f367c53 100644 --- 
a/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_BLOOMZ_ZeroShot.py @@ -13,9 +13,7 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/attentionworthy/CT22_arabic_1D_attentionworthy_test_gold.tsv" - }, + "general_args": {"test_split": "ar"}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT35_ZeroShot.py index 1d1295ef..b0ee4b3b 100644 --- a/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT35_ZeroShot.py @@ -26,9 +26,7 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/attentionworthy/CT22_arabic_1D_attentionworthy_test_gold.tsv" - }, + "general_args": {"test_split": "ar"}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_FewShot.py index e7fa70f4..5b2c311e 100644 --- a/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_FewShot.py @@ -24,12 +24,7 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/attentionworthy/CT22_arabic_1D_attentionworthy_test_gold.tsv", - "fewshot": { - "train_data_path": 
"data/factuality_disinformation_harmful_content/attentionworthy/CT22_arabic_1D_attentionworthy_train.tsv", - }, - }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_ZeroShot.py index 9eee58ed..cbcbab04 100644 --- a/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_GPT4_ZeroShot.py @@ -24,9 +24,7 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/attentionworthy/CT22_arabic_1D_attentionworthy_test_gold.tsv" - }, + "general_args": {"test_split": "ar"}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py index 9aaa6618..8903bf9e 100644 --- a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py @@ -14,9 +14,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/arabic/CT22_arabic_1A_checkworthy_test_gold.tsv" - }, + "general_args": {"test_split": "ar"}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT35_ZeroShot.py index 67d59b55..23935775 100644 --- 
a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT35_ZeroShot.py @@ -14,9 +14,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/arabic/CT22_arabic_1A_checkworthy_test_gold.tsv" - }, + "general_args": {"test_split": "ar"}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py index fa39528c..34cbf48b 100644 --- a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py @@ -1,4 +1,3 @@ -import random import re from llmebench.datasets import CT22CheckworthinessDataset @@ -6,9 +5,6 @@ from llmebench.tasks import CheckworthinessTask -random.seed(1333) - - def config(): return { "dataset": CT22CheckworthinessDataset, @@ -20,12 +16,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/arabic/CT22_arabic_1A_checkworthy_test_gold.tsv", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/checkworthyness/arabic/CT22_arabic_1A_checkworthy_train.tsv", - }, - }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py index 16b66362..1dbb01c5 100644 --- 
a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py @@ -1,4 +1,3 @@ -import random import re from llmebench.datasets import CT22CheckworthinessDataset @@ -6,9 +5,6 @@ from llmebench.tasks import CheckworthinessTask -random.seed(1333) - - def config(): return { "dataset": CT22CheckworthinessDataset, @@ -20,9 +16,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/arabic/CT22_arabic_1A_checkworthy_test_gold.tsv" - }, + "general_args": {"test_split": "ar"}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_BLOOMZ_ZeroShot.py index bf0e1c50..63adafca 100644 --- a/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_BLOOMZ_ZeroShot.py @@ -13,9 +13,7 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/claim_covid19/CT22_arabic_1B_claim_test_gold.tsv" - }, + "general_args": {"test_split": "ar"}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT35_ZeroShot.py index 4006ece8..e17ca409 100644 --- a/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT35_ZeroShot.py @@ -14,9 +14,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 3, }, - "general_args": { - "data_path": 
"data/factuality_disinformation_harmful_content/claim_covid19/CT22_arabic_1B_claim_test_gold.tsv" - }, + "general_args": {"test_split": "ar"}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_FewShot.py index 8221f958..58a948a6 100644 --- a/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_FewShot.py @@ -13,12 +13,7 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/claim_covid19/CT22_arabic_1B_claim_test_gold.tsv", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/claim_covid19/CT22_arabic_1B_claim_train.tsv" - }, - }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_ZeroShot.py index eeaea109..cf07aae4 100644 --- a/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_ZeroShot.py @@ -1,11 +1,11 @@ -from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.datasets import CT22ClaimDataset from llmebench.models import OpenAIModel from llmebench.tasks import CheckworthinessTask def config(): return { - "dataset": CT22CheckworthinessDataset, + "dataset": CT22ClaimDataset, "dataset_args": {}, "task": CheckworthinessTask, "task_args": {}, @@ -14,9 +14,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 30, }, - "general_args": { - "data_path": 
"data/factuality_disinformation_harmful_content/claim_covid19/CT22_arabic_1B_claim_test_gold.tsv" - }, + "general_args": {"test_split": "ar"}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_BLOOMZ_ZeroShot.py similarity index 84% rename from assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_BLOOMZ_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_BLOOMZ_ZeroShot.py index 2cec59a7..75c9b3cb 100644 --- a/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_BLOOMZ_ZeroShot.py @@ -1,11 +1,11 @@ -from llmebench.datasets import Khouja20FactualityDataset +from llmebench.datasets import ANSFactualityDataset from llmebench.models import PetalsModel from llmebench.tasks import FactualityTask def config(): return { - "dataset": Khouja20FactualityDataset, + "dataset": ANSFactualityDataset, "dataset_args": {}, "task": FactualityTask, "task_args": {}, @@ -14,9 +14,6 @@ def config(): "class_labels": ["true", "false"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/claim/test.csv" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT35_ZeroShot.py similarity index 80% rename from assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT35_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT35_ZeroShot.py index badc5a8b..4a74a43b 100644 --- 
a/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT35_ZeroShot.py @@ -1,11 +1,11 @@ -from llmebench.datasets import Khouja20FactualityDataset +from llmebench.datasets import ANSFactualityDataset from llmebench.models import LegacyOpenAIModel from llmebench.tasks import FactualityTask def config(): return { - "dataset": Khouja20FactualityDataset, + "dataset": ANSFactualityDataset, "dataset_args": {}, "task": FactualityTask, "task_args": {}, @@ -14,9 +14,6 @@ def config(): "class_labels": ["true", "false"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/claim/test.csv" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot.py similarity index 81% rename from assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT4_FewShot.py rename to assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot.py index 93af23c3..c3c6bdf5 100644 --- a/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_FewShot.py @@ -1,11 +1,11 @@ -from llmebench.datasets import Khouja20FactualityDataset +from llmebench.datasets import ANSFactualityDataset from llmebench.models import OpenAIModel from llmebench.tasks import FactualityTask def config(): return { - "dataset": Khouja20FactualityDataset, + "dataset": ANSFactualityDataset, "dataset_args": {}, "task": FactualityTask, "task_args": {}, @@ -13,12 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": 
"data/factuality_disinformation_harmful_content/factuality_stance_khouja/claim/test.csv", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/claim/train.csv" - }, - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot.py similarity index 83% rename from assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT4_ZeroShot.py rename to assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot.py index c9d8de3c..0de85093 100644 --- a/assets/ar/factuality_disinformation_harmful_content/factuality/Khouja20Factuality_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_GPT4_ZeroShot.py @@ -1,11 +1,11 @@ -from llmebench.datasets import Khouja20FactualityDataset +from llmebench.datasets import ANSFactualityDataset from llmebench.models import OpenAIModel from llmebench.tasks import FactualityTask def config(): return { - "dataset": Khouja20FactualityDataset, + "dataset": ANSFactualityDataset, "dataset_args": {}, "task": FactualityTask, "task_args": {}, @@ -13,9 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/claim/test.csv", - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_BLOOMZ_ZeroShot.py index 5fff5185..c9f5b1cd 100644 --- a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_BLOOMZ_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["yes", 
"no"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_covid19/covid19_infodemic_arabic_data_factuality_binary_test.tsv", - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT35_ZeroShot.py index e6e17bfa..8a752ea1 100644 --- a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT35_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["yes", "no"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_covid19/covid19_infodemic_arabic_data_factuality_binary_test.tsv" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_FewShot.py index ed55abf8..6c5ea8a3 100644 --- a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_FewShot.py @@ -14,12 +14,6 @@ def config(): "class_labels": ["yes", "no"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_covid19/covid19_infodemic_arabic_data_factuality_binary_test.tsv", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/factuality_covid19/covid19_infodemic_arabic_data_factuality_binary_train.tsv", - }, - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_ZeroShot.py index 
20647c5c..2ee49fe1 100644 --- a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["yes", "no"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_covid19/covid19_infodemic_arabic_data_factuality_binary_test.tsv" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_BLOOMZ_ZeroShot.py index b24310fe..89d8a6ff 100644 --- a/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_BLOOMZ_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["true", "false"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_stance_ramy/ramy_arabic_fact_checking.tsv" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT35_ZeroShot.py index 51897d49..b3dd4bec 100644 --- a/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT35_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["true", "false"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_stance_ramy/ramy_arabic_fact_checking.tsv" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_FewShot.py 
b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_FewShot.py index 2a0e77ab..5f24e4f3 100644 --- a/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_FewShot.py @@ -14,9 +14,7 @@ def config(): "max_tries": 3, }, "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_stance_ramy/ramy_arabic_fact_checking.tsv", "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/claim/train.csv", "deduplicate": False, # N-fold evaluation }, }, diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_ZeroShot.py index 259c4b04..66f32d29 100644 --- a/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_GPT4_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["true", "false"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_stance_ramy/ramy_arabic_fact_checking.tsv" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_BLOOMZ_ZeroShot.py index 137c34c7..c0e6754c 100644 --- a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_BLOOMZ_ZeroShot.py @@ -13,9 +13,7 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": 
"data/factuality_disinformation_harmful_content/harmful/CT22_arabic_1C_harmful_test_gold.tsv" - }, + "general_args": {"test_split": "ar"}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT35_ZeroShot.py index 15d4ba46..20c3f5ba 100644 --- a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT35_ZeroShot.py @@ -14,9 +14,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/harmful/CT22_arabic_1C_harmful_test_gold.tsv" - }, + "general_args": {"test_split": "ar"}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot.py index d1d024f9..dfb93877 100644 --- a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_FewShot.py @@ -13,12 +13,7 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/harmful/CT22_arabic_1C_harmful_test_gold.tsv", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/harmful/CT22_arabic_1C_harmful_train.tsv" - }, - }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot.py index 93c887cd..17587c9d 
100644 --- a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot.py @@ -1,4 +1,3 @@ -import random import re from llmebench.datasets import CT22HarmfulDataset @@ -6,9 +5,6 @@ from llmebench.tasks import CheckworthinessTask -random.seed(1333) - - def config(): return { "dataset": CT22HarmfulDataset, @@ -20,9 +16,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/harmful/CT22_arabic_1C_harmful_test_gold.tsv" - }, + "general_args": {"test_split": "ar"}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_BLOOMZ_ZeroShot.py index 48adbe95..49b9b0c1 100644 --- a/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_BLOOMZ_ZeroShot.py @@ -16,9 +16,6 @@ def config(): "class_labels": ["HS", "NOT_HS"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/hate_speech/OSACT2020-sharedTask-test-tweets-labels.txt" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT35_ZeroShot.py index 8559f54b..87d7094e 100644 --- a/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT35_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["HS", "NOT_HS"], "max_tries": 3, }, - "general_args": { - "data_path": 
"data/factuality_disinformation_harmful_content/hate_speech/OSACT2020-sharedTask-test-tweets-labels.txt" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT4_FewShot.py index cee144e2..599e87ca 100644 --- a/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT4_FewShot.py @@ -13,12 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/hate_speech/OSACT2020-sharedTask-test-tweets-labels.txt", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/hate_speech/OSACT2020-sharedTask-train_HS.txt", # TO_DO - }, - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT4_ZeroShot.py index 73778c43..360c993f 100644 --- a/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_GPT4_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["HS", "NOT_HS"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/hate_speech/OSACT2020-sharedTask-test-tweets-labels.txt" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_BLOOMZ_ZeroShot.py index ad6d1f55..5b6c9a9b 100644 --- a/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_BLOOMZ_ZeroShot.py +++ 
b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_BLOOMZ_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["OFF", "NOT_OFF"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/offensive_language/OSACT2020-sharedTask-test-tweets-labels.txt" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT35_ZeroShot.py index e13fb948..87e1f9eb 100644 --- a/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT35_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["OFF", "NOT_OFF"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/offensive_language/OSACT2020-sharedTask-test-tweets-labels.txt" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_FewShot.py index 54fc88a7..f8fd9370 100644 --- a/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_FewShot.py @@ -14,12 +14,6 @@ def config(): "class_labels": ["OFF", "NOT_OFF"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/offensive_language/OSACT2020-sharedTask-test-tweets-labels.txt", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/offensive_language/OSACT2020-sharedTask-train_OFF.txt", # TO_DO - }, - }, } diff --git 
a/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_ZeroShot.py index 665c8cc9..5f94264a 100644 --- a/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_GPT4_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["OFF", "NOT_OFF"], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/offensive_language/OSACT2020-sharedTask-test-tweets-labels.txt" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_BLOOMZ_ZeroShot.py index 5e0988ec..7664c7c3 100644 --- a/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_BLOOMZ_ZeroShot.py @@ -38,9 +38,6 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda/task1_test_gold_label_final.json" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_GPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_GPT35_ZeroShot.py index 5ed5e64f..691cf125 100644 --- a/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_GPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_GPT35_ZeroShot.py @@ -17,9 +17,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda/task1_test_gold_label_final.json" - }, } diff --git 
a/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_GPT4_FewShot.py index 895fe5cd..08c7fa9c 100644 --- a/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_GPT4_FewShot.py @@ -38,12 +38,6 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda/task1_test_gold_label_final.json", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/propaganda/task1_train.json", - }, - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_GPT4_ZeroShot.py index 938fd8bf..e6955405 100644 --- a/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22Propaganda_GPT4_ZeroShot.py @@ -38,9 +38,6 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda/task1_test_gold_label_final.json" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/spam/Spam_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/spam/Spam_BLOOMZ_ZeroShot.py index e3ea01ee..ab737b1b 100644 --- a/assets/ar/factuality_disinformation_harmful_content/spam/Spam_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/spam/Spam_BLOOMZ_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["__label__ADS", "__label__NOTADS"], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/spam/ArabicAds-test.txt" - }, } diff --git 
a/assets/ar/factuality_disinformation_harmful_content/spam/Spam_GPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/spam/Spam_GPT35_ZeroShot.py index b7eefb93..e2ee1c75 100644 --- a/assets/ar/factuality_disinformation_harmful_content/spam/Spam_GPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/spam/Spam_GPT35_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["__label__ADS", "__label__NOTADS"], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/spam/ArabicAds-test.txt" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/spam/Spam_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/spam/Spam_GPT4_ZeroShot.py index 58233cbc..e0ff78a4 100644 --- a/assets/ar/factuality_disinformation_harmful_content/spam/Spam_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/spam/Spam_GPT4_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["__label__ADS", "__label__NOTADS"], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/spam/ArabicAds-test.txt" - }, } diff --git a/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_BLOOMZ_ZeroShot.py index 17867006..5164b96a 100644 --- a/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_BLOOMZ_ZeroShot.py @@ -14,9 +14,7 @@ def config(): "class_labels": ["SUBJ", "OBJ"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/subjectivity/dev_ar.tsv" - }, + "general_args": {"test_split": "ar/dev"}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT35_ZeroShot.py 
b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT35_ZeroShot.py index c548634b..d1b0948e 100644 --- a/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT35_ZeroShot.py @@ -14,9 +14,7 @@ def config(): "class_labels": ["SUBJ", "OBJ"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/subjectivity/dev_ar.tsv" - }, + "general_args": {"test_split": "ar/dev"}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_FewShot.py index a5bd61c9..6c5a592a 100644 --- a/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_FewShot.py @@ -14,12 +14,7 @@ def config(): "class_labels": ["SUBJ", "OBJ"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/subjectivity/dev_ar.tsv", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/subjectivity/train_ar.tsv" - }, - }, + "general_args": {"test_split": "ar/dev", "fewshot": {"train_split": "ar"}}, } diff --git a/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_ZeroShot.py index e2883e2c..8cd2bf21 100644 --- a/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_GPT4_ZeroShot.py @@ -1,4 +1,3 @@ -import random import re from llmebench.datasets import CT23SubjectivityDataset @@ -6,9 +5,6 
@@ from llmebench.tasks import SubjectivityTask -random.seed(1333) - - def config(): return { "dataset": CT23SubjectivityDataset, @@ -20,9 +16,7 @@ def config(): "class_labels": ["SUBJ", "OBJ"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/subjectivity/dev_ar.tsv" - }, + "general_args": {"test_split": "ar/dev"}, } diff --git a/assets/ar/news_categorization/ASND_BLOOMZ_ZeroShot.py b/assets/ar/news_categorization/ASND_BLOOMZ_ZeroShot.py index 668e0f37..a4aa3805 100644 --- a/assets/ar/news_categorization/ASND_BLOOMZ_ZeroShot.py +++ b/assets/ar/news_categorization/ASND_BLOOMZ_ZeroShot.py @@ -1,11 +1,13 @@ -from llmebench.datasets import NewsCatASNDDataset +import random + +from llmebench.datasets import ASNDDataset from llmebench.models import PetalsModel from llmebench.tasks import NewsCategorizationTask def config(): return { - "dataset": NewsCatASNDDataset, + "dataset": ASNDDataset, "dataset_args": {}, "task": NewsCategorizationTask, "task_args": {}, @@ -27,9 +29,6 @@ def config(): ], "max_tries": 10, }, - "general_args": { - "data_path": "data/news_categorization/Arabic_Social_Media_News_Dataset_ASND/sm_news_ar_tst.csv" - }, } diff --git a/assets/ar/news_categorization/ASND_GPT35_ZeroShot.py b/assets/ar/news_categorization/ASND_GPT35_ZeroShot.py index 9d59648d..c87f0f8b 100644 --- a/assets/ar/news_categorization/ASND_GPT35_ZeroShot.py +++ b/assets/ar/news_categorization/ASND_GPT35_ZeroShot.py @@ -1,11 +1,11 @@ -from llmebench.datasets import NewsCatASNDDataset +from llmebench.datasets import ASNDDataset from llmebench.models import LegacyOpenAIModel from llmebench.tasks import NewsCategorizationTask def config(): return { - "dataset": NewsCatASNDDataset, + "dataset": ASNDDataset, "dataset_args": {}, "task": NewsCategorizationTask, "task_args": {"test": "useless"}, @@ -27,9 +27,6 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": 
"data/news_categorization/Arabic_Social_Media_News_Dataset_ASND/sm_news_ar_tst.csv" - }, } diff --git a/assets/ar/news_categorization/ASND_GPT4_FewShot.py b/assets/ar/news_categorization/ASND_GPT4_FewShot.py index c14e1b96..f241f7a3 100644 --- a/assets/ar/news_categorization/ASND_GPT4_FewShot.py +++ b/assets/ar/news_categorization/ASND_GPT4_FewShot.py @@ -1,11 +1,11 @@ -from llmebench.datasets import NewsCatASNDDataset +from llmebench.datasets import ASNDDataset from llmebench.models import OpenAIModel from llmebench.tasks import NewsCategorizationTask def config(): return { - "dataset": NewsCatASNDDataset, + "dataset": ASNDDataset, "dataset_args": {}, "task": NewsCategorizationTask, "task_args": {}, @@ -27,12 +27,6 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/news_categorization/Arabic_Social_Media_News_Dataset_ASND/sm_news_ar_tst.csv", - "fewshot": { - "train_data_path": "data/news_categorization/Arabic_Social_Media_News_Dataset_ASND/sm_news_ar_trn.csv" - }, - }, } diff --git a/assets/ar/news_categorization/ASND_GPT4_ZeroShot.py b/assets/ar/news_categorization/ASND_GPT4_ZeroShot.py index ae7fab9d..a0d0bdc4 100644 --- a/assets/ar/news_categorization/ASND_GPT4_ZeroShot.py +++ b/assets/ar/news_categorization/ASND_GPT4_ZeroShot.py @@ -1,11 +1,11 @@ -from llmebench.datasets import NewsCatASNDDataset +from llmebench.datasets import ASNDDataset from llmebench.models import OpenAIModel from llmebench.tasks import NewsCategorizationTask def config(): return { - "dataset": NewsCatASNDDataset, + "dataset": ASNDDataset, "dataset_args": {}, "task": NewsCategorizationTask, "task_args": {}, @@ -27,9 +27,6 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/news_categorization/Arabic_Social_Media_News_Dataset_ASND/sm_news_ar_tst.csv" - }, } diff --git a/assets/ar/news_categorization/Akhbarona_BLOOMZ_ZeroShot.py b/assets/ar/news_categorization/SANADAkhbarona_BLOOMZ_ZeroShot.py similarity index 91% rename from 
assets/ar/news_categorization/Akhbarona_BLOOMZ_ZeroShot.py rename to assets/ar/news_categorization/SANADAkhbarona_BLOOMZ_ZeroShot.py index ed364822..747eda43 100644 --- a/assets/ar/news_categorization/Akhbarona_BLOOMZ_ZeroShot.py +++ b/assets/ar/news_categorization/SANADAkhbarona_BLOOMZ_ZeroShot.py @@ -1,6 +1,6 @@ import random -from llmebench.datasets import NewsCatAkhbaronaDataset +from llmebench.datasets import SANADAkhbaronaDataset from llmebench.models import PetalsModel from llmebench.tasks import NewsCategorizationTask @@ -9,7 +9,7 @@ def config(): return { - "dataset": NewsCatAkhbaronaDataset, + "dataset": SANADAkhbaronaDataset, "dataset_args": {}, "task": NewsCategorizationTask, "task_args": {}, @@ -27,9 +27,6 @@ def config(): "max_tries": 10, "max_tokens": 8000, }, - "general_args": { - "data_path": "data/news_categorization/SANAD_akhbarona_news_cat_test.tsv" - }, } diff --git a/assets/ar/news_categorization/Akhbarona_GPT35_ZeroShot.py b/assets/ar/news_categorization/SANADAkhbarona_GPT35_ZeroShot.py similarity index 89% rename from assets/ar/news_categorization/Akhbarona_GPT35_ZeroShot.py rename to assets/ar/news_categorization/SANADAkhbarona_GPT35_ZeroShot.py index 74ee4262..4f92eca0 100644 --- a/assets/ar/news_categorization/Akhbarona_GPT35_ZeroShot.py +++ b/assets/ar/news_categorization/SANADAkhbarona_GPT35_ZeroShot.py @@ -1,6 +1,6 @@ import random -from llmebench.datasets import NewsCatAkhbaronaDataset +from llmebench.datasets import SANADAkhbaronaDataset from llmebench.models import LegacyOpenAIModel from llmebench.tasks import NewsCategorizationTask @@ -9,7 +9,7 @@ def config(): return { - "dataset": NewsCatAkhbaronaDataset, + "dataset": SANADAkhbaronaDataset, "dataset_args": {}, "task": NewsCategorizationTask, "task_args": {}, @@ -26,9 +26,6 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/news_categorization/SANAD_akhbarona_news_cat_test.tsv" - }, } diff --git a/assets/ar/news_categorization/Akhbarona_GPT4_FewShot.py 
b/assets/ar/news_categorization/SANADAkhbarona_GPT4_FewShot.py similarity index 87% rename from assets/ar/news_categorization/Akhbarona_GPT4_FewShot.py rename to assets/ar/news_categorization/SANADAkhbarona_GPT4_FewShot.py index 160b5f5e..51146752 100644 --- a/assets/ar/news_categorization/Akhbarona_GPT4_FewShot.py +++ b/assets/ar/news_categorization/SANADAkhbarona_GPT4_FewShot.py @@ -1,6 +1,6 @@ import random -from llmebench.datasets import NewsCatAkhbaronaDataset +from llmebench.datasets import SANADAkhbaronaDataset from llmebench.models import OpenAIModel from llmebench.tasks import NewsCategorizationTask @@ -10,7 +10,7 @@ def config(): return { - "dataset": NewsCatAkhbaronaDataset, + "dataset": SANADAkhbaronaDataset, "dataset_args": {}, "task": NewsCategorizationTask, "task_args": {}, @@ -27,12 +27,6 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/news_categorization/SANAD_akhbarona_news_cat_test.tsv", - "fewshot": { - "train_data_path": "data/news_categorization/SANAD_akhbarona_news_cat_train.tsv" - }, - }, } diff --git a/assets/ar/news_categorization/Akhbarona_GPT4_ZeroShot.py b/assets/ar/news_categorization/SANADAkhbarona_GPT4_ZeroShot.py similarity index 89% rename from assets/ar/news_categorization/Akhbarona_GPT4_ZeroShot.py rename to assets/ar/news_categorization/SANADAkhbarona_GPT4_ZeroShot.py index b3c39b38..88b4c150 100644 --- a/assets/ar/news_categorization/Akhbarona_GPT4_ZeroShot.py +++ b/assets/ar/news_categorization/SANADAkhbarona_GPT4_ZeroShot.py @@ -1,6 +1,6 @@ import random -from llmebench.datasets import NewsCatAkhbaronaDataset +from llmebench.datasets import SANADAkhbaronaDataset from llmebench.models import OpenAIModel from llmebench.tasks import NewsCategorizationTask @@ -10,7 +10,7 @@ def config(): return { - "dataset": NewsCatAkhbaronaDataset, + "dataset": SANADAkhbaronaDataset, "dataset_args": {}, "task": NewsCategorizationTask, "task_args": {}, @@ -27,9 +27,6 @@ def config(): ], "max_tries": 30, }, - 
"general_args": { - "data_path": "data/news_categorization/SANAD_akhbarona_news_cat_test.tsv" - }, } diff --git a/assets/ar/news_categorization/AlArabiya_BLOOMZ_ZeroShot.py b/assets/ar/news_categorization/SANADAlArabiya_BLOOMZ_ZeroShot.py similarity index 90% rename from assets/ar/news_categorization/AlArabiya_BLOOMZ_ZeroShot.py rename to assets/ar/news_categorization/SANADAlArabiya_BLOOMZ_ZeroShot.py index aa5f409e..8394ad00 100644 --- a/assets/ar/news_categorization/AlArabiya_BLOOMZ_ZeroShot.py +++ b/assets/ar/news_categorization/SANADAlArabiya_BLOOMZ_ZeroShot.py @@ -1,6 +1,6 @@ import random -from llmebench.datasets import NewsCatAlArabiyaDataset +from llmebench.datasets import SANADAlArabiyaDataset from llmebench.models import PetalsModel from llmebench.tasks import NewsCategorizationTask @@ -9,7 +9,7 @@ def config(): return { - "dataset": NewsCatAlArabiyaDataset, + "dataset": SANADAlArabiyaDataset, "dataset_args": {}, "task": NewsCategorizationTask, "task_args": {}, @@ -26,9 +26,6 @@ def config(): ], "max_tries": 10, }, - "general_args": { - "data_path": "data/news_categorization/SANAD_alarabiya_news_cat_test.tsv" - }, } diff --git a/assets/ar/news_categorization/AlArabiya_GPT35_ZeroShot.py b/assets/ar/news_categorization/SANADAlArabiya_GPT35_ZeroShot.py similarity index 88% rename from assets/ar/news_categorization/AlArabiya_GPT35_ZeroShot.py rename to assets/ar/news_categorization/SANADAlArabiya_GPT35_ZeroShot.py index 761bd8e0..9ea97543 100644 --- a/assets/ar/news_categorization/AlArabiya_GPT35_ZeroShot.py +++ b/assets/ar/news_categorization/SANADAlArabiya_GPT35_ZeroShot.py @@ -1,6 +1,6 @@ import random -from llmebench.datasets import NewsCatAlArabiyaDataset +from llmebench.datasets import SANADAlArabiyaDataset from llmebench.models import LegacyOpenAIModel from llmebench.tasks import NewsCategorizationTask @@ -9,7 +9,7 @@ def config(): return { - "dataset": NewsCatAlArabiyaDataset, + "dataset": SANADAlArabiyaDataset, "dataset_args": {}, "task": 
NewsCategorizationTask, "task_args": {}, @@ -26,9 +26,6 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/news_categorization/SANAD_alarabiya_news_cat_test.tsv" - }, } diff --git a/assets/ar/news_categorization/AlArabiya_GPT4_FewShot.py b/assets/ar/news_categorization/SANADAlArabiya_GPT4_FewShot.py similarity index 87% rename from assets/ar/news_categorization/AlArabiya_GPT4_FewShot.py rename to assets/ar/news_categorization/SANADAlArabiya_GPT4_FewShot.py index 1c90f1a0..cf188bf4 100644 --- a/assets/ar/news_categorization/AlArabiya_GPT4_FewShot.py +++ b/assets/ar/news_categorization/SANADAlArabiya_GPT4_FewShot.py @@ -1,6 +1,6 @@ import random -from llmebench.datasets import NewsCatAlArabiyaDataset +from llmebench.datasets import SANADAlArabiyaDataset from llmebench.models import OpenAIModel from llmebench.tasks import NewsCategorizationTask @@ -10,7 +10,7 @@ def config(): return { - "dataset": NewsCatAlArabiyaDataset, + "dataset": SANADAlArabiyaDataset, "dataset_args": {}, "task": NewsCategorizationTask, "task_args": {}, @@ -27,12 +27,6 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/news_categorization/SANAD_alarabiya_news_cat_test.tsv", - "fewshot": { - "train_data_path": "data/news_categorization/SANAD_alarabiya_news_cat_train.tsv" - }, - }, } diff --git a/assets/ar/news_categorization/AlArabiya_GPT4_ZeroShot.py b/assets/ar/news_categorization/SANADAlArabiya_GPT4_ZeroShot.py similarity index 89% rename from assets/ar/news_categorization/AlArabiya_GPT4_ZeroShot.py rename to assets/ar/news_categorization/SANADAlArabiya_GPT4_ZeroShot.py index 15413b7f..53b647ab 100644 --- a/assets/ar/news_categorization/AlArabiya_GPT4_ZeroShot.py +++ b/assets/ar/news_categorization/SANADAlArabiya_GPT4_ZeroShot.py @@ -1,6 +1,6 @@ import random -from llmebench.datasets import NewsCatAlArabiyaDataset +from llmebench.datasets import SANADAlArabiyaDataset from llmebench.models import OpenAIModel from llmebench.tasks import 
NewsCategorizationTask @@ -10,7 +10,7 @@ def config(): return { - "dataset": NewsCatAlArabiyaDataset, + "dataset": SANADAlArabiyaDataset, "dataset_args": {}, "task": NewsCategorizationTask, "task_args": {}, @@ -27,9 +27,6 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/news_categorization/SANAD_alarabiya_news_cat_test.tsv" - }, } diff --git a/assets/ar/news_categorization/AlKhaleej_BLOOMZ_ZeroShot.py b/assets/ar/news_categorization/SANADAlKhaleej_BLOOMZ_ZeroShot.py similarity index 90% rename from assets/ar/news_categorization/AlKhaleej_BLOOMZ_ZeroShot.py rename to assets/ar/news_categorization/SANADAlKhaleej_BLOOMZ_ZeroShot.py index fe022e0c..d9b6610a 100644 --- a/assets/ar/news_categorization/AlKhaleej_BLOOMZ_ZeroShot.py +++ b/assets/ar/news_categorization/SANADAlKhaleej_BLOOMZ_ZeroShot.py @@ -1,6 +1,6 @@ import random -from llmebench.datasets import NewsCatAlArabiyaDataset +from llmebench.datasets import SANADAlKhaleejDataset from llmebench.models import PetalsModel from llmebench.tasks import NewsCategorizationTask @@ -9,7 +9,7 @@ def config(): return { - "dataset": NewsCatAlArabiyaDataset, + "dataset": SANADAlKhaleejDataset, "dataset_args": {}, "task": NewsCategorizationTask, "task_args": {}, @@ -26,9 +26,6 @@ def config(): ], "max_tries": 10, }, - "general_args": { - "data_path": "data/news_categorization/SANAD_alarabiya_news_cat_test.tsv" - }, } diff --git a/assets/ar/news_categorization/AlKhaleej_GPT35_ZeroShot.py b/assets/ar/news_categorization/SANADAlKhaleej_GPT35_ZeroShot.py similarity index 88% rename from assets/ar/news_categorization/AlKhaleej_GPT35_ZeroShot.py rename to assets/ar/news_categorization/SANADAlKhaleej_GPT35_ZeroShot.py index ff845477..bda11d76 100644 --- a/assets/ar/news_categorization/AlKhaleej_GPT35_ZeroShot.py +++ b/assets/ar/news_categorization/SANADAlKhaleej_GPT35_ZeroShot.py @@ -1,6 +1,6 @@ import random -from llmebench.datasets import NewsCatAlKhaleejDataset +from llmebench.datasets import 
SANADAlKhaleejDataset from llmebench.models import LegacyOpenAIModel from llmebench.tasks import NewsCategorizationTask @@ -9,7 +9,7 @@ def config(): return { - "dataset": NewsCatAlKhaleejDataset, + "dataset": SANADAlKhaleejDataset, "dataset_args": {}, "task": NewsCategorizationTask, "task_args": {}, @@ -26,9 +26,6 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/news_categorization/SANAD_alkhaleej_news_cat_test.tsv" - }, } diff --git a/assets/ar/news_categorization/AlKhaleej_GPT4_FewShot.py b/assets/ar/news_categorization/SANADAlKhaleej_GPT4_FewShot.py similarity index 87% rename from assets/ar/news_categorization/AlKhaleej_GPT4_FewShot.py rename to assets/ar/news_categorization/SANADAlKhaleej_GPT4_FewShot.py index 01f4c144..b73ced51 100644 --- a/assets/ar/news_categorization/AlKhaleej_GPT4_FewShot.py +++ b/assets/ar/news_categorization/SANADAlKhaleej_GPT4_FewShot.py @@ -1,6 +1,6 @@ import random -from llmebench.datasets import NewsCatAlKhaleejDataset +from llmebench.datasets import SANADAlKhaleejDataset from llmebench.models import OpenAIModel from llmebench.tasks import NewsCategorizationTask @@ -10,7 +10,7 @@ def config(): return { - "dataset": NewsCatAlKhaleejDataset, + "dataset": SANADAlKhaleejDataset, "dataset_args": {}, "task": NewsCategorizationTask, "task_args": {}, @@ -27,12 +27,6 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/news_categorization/SANAD_alkhaleej_news_cat_test.tsv", - "fewshot": { - "train_data_path": "data/news_categorization/SANAD_alkhaleej_news_cat_train.tsv" - }, - }, } diff --git a/assets/ar/news_categorization/AlKhaleej_GPT4_ZeroShot.py b/assets/ar/news_categorization/SANADAlKhaleej_GPT4_ZeroShot.py similarity index 89% rename from assets/ar/news_categorization/AlKhaleej_GPT4_ZeroShot.py rename to assets/ar/news_categorization/SANADAlKhaleej_GPT4_ZeroShot.py index f0f75760..5c07ab10 100644 --- a/assets/ar/news_categorization/AlKhaleej_GPT4_ZeroShot.py +++ 
b/assets/ar/news_categorization/SANADAlKhaleej_GPT4_ZeroShot.py @@ -1,6 +1,6 @@ import random -from llmebench.datasets import NewsCatAlKhaleejDataset +from llmebench.datasets import SANADAlKhaleejDataset from llmebench.models import OpenAIModel from llmebench.tasks import NewsCategorizationTask @@ -10,7 +10,7 @@ def config(): return { - "dataset": NewsCatAlKhaleejDataset, + "dataset": SANADAlKhaleejDataset, "dataset_args": {}, "task": NewsCategorizationTask, "task_args": {}, @@ -27,9 +27,6 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/news_categorization/SANAD_alkhaleej_news_cat_test.tsv" - }, } diff --git a/assets/ar/semantics/NLI/XNLI_BLOOMZ_ZeroShot.py b/assets/ar/semantics/NLI/XNLI_BLOOMZ_ZeroShot.py index 4349fe98..5931154d 100644 --- a/assets/ar/semantics/NLI/XNLI_BLOOMZ_ZeroShot.py +++ b/assets/ar/semantics/NLI/XNLI_BLOOMZ_ZeroShot.py @@ -13,7 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": {"data_path": "data/XNLI/xnli.test.ar.tsv"}, } diff --git a/assets/ar/semantics/NLI/XNLI_GPT35_ZeroShot.py b/assets/ar/semantics/NLI/XNLI_GPT35_ZeroShot.py index 3695b9a6..d8d8b0c2 100644 --- a/assets/ar/semantics/NLI/XNLI_GPT35_ZeroShot.py +++ b/assets/ar/semantics/NLI/XNLI_GPT35_ZeroShot.py @@ -13,7 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": {"data_path": "data/XNLI/xnli.test.ar.tsv"}, } diff --git a/assets/ar/semantics/NLI/XNLI_GPT4_FewShot.py b/assets/ar/semantics/NLI/XNLI_GPT4_FewShot.py index e34300a0..b8e8b096 100644 --- a/assets/ar/semantics/NLI/XNLI_GPT4_FewShot.py +++ b/assets/ar/semantics/NLI/XNLI_GPT4_FewShot.py @@ -13,10 +13,7 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/XNLI/xnli.test.ar.tsv", - "fewshot": {"train_data_path": "data/XNLI/xnli.dev.tsv"}, - }, + "general_args": {"fewshot": {"train_split": "dev"}}, } diff --git a/assets/ar/semantics/NLI/XNLI_GPT4_ZeroShot.py 
b/assets/ar/semantics/NLI/XNLI_GPT4_ZeroShot.py index 7c81b664..8a343b11 100644 --- a/assets/ar/semantics/NLI/XNLI_GPT4_ZeroShot.py +++ b/assets/ar/semantics/NLI/XNLI_GPT4_ZeroShot.py @@ -13,7 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": {"data_path": "data/XNLI/xnli.test.ar.tsv"}, } diff --git a/assets/ar/semantics/STS/Q2QSim_BLOOMZ_ZeroShot.py b/assets/ar/semantics/STS/Q2QSim_BLOOMZ_ZeroShot.py index 4bf2d3dc..c93d6a20 100644 --- a/assets/ar/semantics/STS/Q2QSim_BLOOMZ_ZeroShot.py +++ b/assets/ar/semantics/STS/Q2QSim_BLOOMZ_ZeroShot.py @@ -15,9 +15,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/STS/nsurl-2019-task8/test.tsv", - }, } diff --git a/assets/ar/semantics/STS/Q2QSim_GPT35_ZeroShot.py b/assets/ar/semantics/STS/Q2QSim_GPT35_ZeroShot.py index 83b3d055..855d132c 100644 --- a/assets/ar/semantics/STS/Q2QSim_GPT35_ZeroShot.py +++ b/assets/ar/semantics/STS/Q2QSim_GPT35_ZeroShot.py @@ -13,9 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/STS/nsurl-2019-task8/test.tsv", - }, } diff --git a/assets/ar/semantics/STS/Q2QSim_GPT4_FewShot.py b/assets/ar/semantics/STS/Q2QSim_GPT4_FewShot.py index 886a056a..58551c5e 100644 --- a/assets/ar/semantics/STS/Q2QSim_GPT4_FewShot.py +++ b/assets/ar/semantics/STS/Q2QSim_GPT4_FewShot.py @@ -15,12 +15,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/STS/nsurl-2019-task8/test.tsv", - "fewshot": { - "train_data_path": "data/STS/nsurl-2019-task8/train.tsv", - }, - }, } diff --git a/assets/ar/semantics/STS/Q2QSim_GPT4_ZeroShot.py b/assets/ar/semantics/STS/Q2QSim_GPT4_ZeroShot.py index 47165f78..186de3ba 100644 --- a/assets/ar/semantics/STS/Q2QSim_GPT4_ZeroShot.py +++ b/assets/ar/semantics/STS/Q2QSim_GPT4_ZeroShot.py @@ -15,9 +15,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/STS/nsurl-2019-task8/test.tsv", - }, } 
diff --git a/assets/ar/semantics/STS/SemEval17T1STS_BLOOMZ_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T1STS_BLOOMZ_ZeroShot.py index 797c137b..87cbc658 100644 --- a/assets/ar/semantics/STS/SemEval17T1STS_BLOOMZ_ZeroShot.py +++ b/assets/ar/semantics/STS/SemEval17T1STS_BLOOMZ_ZeroShot.py @@ -15,12 +15,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": { - "sentences_path": "data/STS/semeval-2017/STS2017.eval.v1.1/STS.input.track1.ar-ar.txt", - "gt_data_path": "data/STS/semeval-2017/STS2017.gs/STS.gs.track1.ar-ar.txt", - } - }, } diff --git a/assets/ar/semantics/STS/SemEval17T1STS_GPT35_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T1STS_GPT35_ZeroShot.py index f4e2b96d..f4792c6a 100644 --- a/assets/ar/semantics/STS/SemEval17T1STS_GPT35_ZeroShot.py +++ b/assets/ar/semantics/STS/SemEval17T1STS_GPT35_ZeroShot.py @@ -15,12 +15,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": { - "sentences_path": "data/STS/semeval-2017/STS2017.eval.v1.1/STS.input.track1.ar-ar.txt", - "gt_data_path": "data/STS/semeval-2017/STS2017.gs/STS.gs.track1.ar-ar.txt", - } - }, } diff --git a/assets/ar/semantics/STS/SemEval17T1STS_GPT4_FewShot.py b/assets/ar/semantics/STS/SemEval17T1STS_GPT4_FewShot.py index d6361b8a..f9c3e7e4 100644 --- a/assets/ar/semantics/STS/SemEval17T1STS_GPT4_FewShot.py +++ b/assets/ar/semantics/STS/SemEval17T1STS_GPT4_FewShot.py @@ -13,15 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": { - "sentences_path": "data/STS/semeval-2017/STS2017.eval.v1.1/STS.input.track1.ar-ar.txt", - "gt_data_path": "data/STS/semeval-2017/STS2017.gs/STS.gs.track1.ar-ar.txt", - }, - "fewshot": { - "train_data_path": "data/STS/semeval-2017/ar_sts_data_updated/Ar_STS/ar.STS.All.txt", - }, - }, } diff --git a/assets/ar/semantics/STS/SemEval17T1STS_GPT4_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T1STS_GPT4_ZeroShot.py index eab3f253..91ae6cab 100644 --- 
a/assets/ar/semantics/STS/SemEval17T1STS_GPT4_ZeroShot.py +++ b/assets/ar/semantics/STS/SemEval17T1STS_GPT4_ZeroShot.py @@ -15,12 +15,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": { - "sentences_path": "data/STS/semeval-2017/STS2017.eval.v1.1/STS.input.track1.ar-ar.txt", - "gt_data_path": "data/STS/semeval-2017/STS2017.gs/STS.gs.track1.ar-ar.txt", - } - }, } diff --git a/assets/ar/semantics/STS/SemEval17T2STS_BLOOMZ_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T2STS_BLOOMZ_ZeroShot.py index bfd2fc47..4aebdf6f 100644 --- a/assets/ar/semantics/STS/SemEval17T2STS_BLOOMZ_ZeroShot.py +++ b/assets/ar/semantics/STS/SemEval17T2STS_BLOOMZ_ZeroShot.py @@ -15,12 +15,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": { - "sentences_path": "data/STS/semeval-2017/STS2017.eval.v1.1/STS.input.track2.ar-en.txt", - "gt_data_path": "data/STS/semeval-2017/STS2017.gs/STS.gs.track2.ar-en.txt", - } - }, } diff --git a/assets/ar/semantics/STS/SemEval17T2STS_GPT35_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T2STS_GPT35_ZeroShot.py index 4d1f9c4d..c76d0a25 100644 --- a/assets/ar/semantics/STS/SemEval17T2STS_GPT35_ZeroShot.py +++ b/assets/ar/semantics/STS/SemEval17T2STS_GPT35_ZeroShot.py @@ -13,12 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": { - "sentences_path": "data/STS/semeval-2017/STS2017.eval.v1.1/STS.input.track2.ar-en.txt", - "gt_data_path": "data/STS/semeval-2017/STS2017.gs/STS.gs.track2.ar-en.txt", - } - }, } diff --git a/assets/ar/semantics/STS/SemEval17T2STS_GPT4_FewShot.py b/assets/ar/semantics/STS/SemEval17T2STS_GPT4_FewShot.py index 54247a3a..dc252a95 100644 --- a/assets/ar/semantics/STS/SemEval17T2STS_GPT4_FewShot.py +++ b/assets/ar/semantics/STS/SemEval17T2STS_GPT4_FewShot.py @@ -13,15 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": { - "sentences_path": 
"data/STS/semeval-2017/STS2017.eval.v1.1/STS.input.track2.ar-en.txt", - "gt_data_path": "data/STS/semeval-2017/STS2017.gs/STS.gs.track2.ar-en.txt", - }, - "fewshot": { - "train_data_path": "data/STS/semeval-2017/ar_sts_data_updated/En_Ar_STS/en_ar.STS.All.txt", - }, - }, } diff --git a/assets/ar/semantics/STS/SemEval17T2STS_GPT4_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T2STS_GPT4_ZeroShot.py index 5859aa77..05443a94 100644 --- a/assets/ar/semantics/STS/SemEval17T2STS_GPT4_ZeroShot.py +++ b/assets/ar/semantics/STS/SemEval17T2STS_GPT4_ZeroShot.py @@ -15,12 +15,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": { - "sentences_path": "data/STS/semeval-2017/STS2017.eval.v1.1/STS.input.track2.ar-en.txt", - "gt_data_path": "data/STS/semeval-2017/STS2017.gs/STS.gs.track2.ar-en.txt", - } - }, } diff --git a/assets/ar/sentiment_emotion_others/emotion/Emotion_BLOOMZ_ZeroShot.py b/assets/ar/sentiment_emotion_others/emotion/Emotion_BLOOMZ_ZeroShot.py index bacd49d7..db4b6b8a 100644 --- a/assets/ar/sentiment_emotion_others/emotion/Emotion_BLOOMZ_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/emotion/Emotion_BLOOMZ_ZeroShot.py @@ -25,9 +25,6 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/emotion/test-gold.txt" - }, } diff --git a/assets/ar/sentiment_emotion_others/emotion/Emotion_GPT35_ZeroShot.py b/assets/ar/sentiment_emotion_others/emotion/Emotion_GPT35_ZeroShot.py index cabb78ae..6e4db2b2 100644 --- a/assets/ar/sentiment_emotion_others/emotion/Emotion_GPT35_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/emotion/Emotion_GPT35_ZeroShot.py @@ -25,9 +25,6 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/emotion/test-gold.txt" - }, } diff --git a/assets/ar/sentiment_emotion_others/emotion/Emotion_GPT4_FewShot.py b/assets/ar/sentiment_emotion_others/emotion/Emotion_GPT4_FewShot.py index 5956d5e0..7e9bc5da 100644 --- 
a/assets/ar/sentiment_emotion_others/emotion/Emotion_GPT4_FewShot.py +++ b/assets/ar/sentiment_emotion_others/emotion/Emotion_GPT4_FewShot.py @@ -25,12 +25,6 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/emotion/test-gold.txt", - "fewshot": { - "train_data_path": "data/sentiment_emotion_others/emotion/train.txt", - }, - }, } diff --git a/assets/ar/sentiment_emotion_others/emotion/Emotion_GPT4_ZeroShot.py b/assets/ar/sentiment_emotion_others/emotion/Emotion_GPT4_ZeroShot.py index 483d8608..c9cd3996 100644 --- a/assets/ar/sentiment_emotion_others/emotion/Emotion_GPT4_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/emotion/Emotion_GPT4_ZeroShot.py @@ -25,9 +25,6 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/emotion/test-gold.txt" - }, } diff --git a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_BLOOMZ_Zeroshot.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_BLOOMZ_ZeroShot.py similarity index 81% rename from assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_BLOOMZ_Zeroshot.py rename to assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_BLOOMZ_ZeroShot.py index 626c526e..5ef7874b 100644 --- a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_BLOOMZ_Zeroshot.py +++ b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_BLOOMZ_ZeroShot.py @@ -1,11 +1,11 @@ -from llmebench.datasets import ArSarcasmDataset +from llmebench.datasets import ArSarcasm2Dataset from llmebench.models import PetalsModel from llmebench.tasks import SarcasmTask def config(): return { - "dataset": ArSarcasmDataset, + "dataset": ArSarcasm2Dataset, "dataset_args": {}, "task": SarcasmTask, "task_args": {}, @@ -14,9 +14,6 @@ def config(): "class_labels": ["TRUE", "FALSE"], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/sarcasm/ArSarcasm2/testing_data.csv" - }, } diff --git 
a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT35_ZeroShot.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT35_ZeroShot.py index 6e8800d6..8e3d8483 100644 --- a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT35_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT35_ZeroShot.py @@ -1,11 +1,11 @@ -from llmebench.datasets import ArSarcasmDataset +from llmebench.datasets import ArSarcasm2Dataset from llmebench.models import LegacyOpenAIModel from llmebench.tasks import SarcasmTask def config(): return { - "dataset": ArSarcasmDataset, + "dataset": ArSarcasm2Dataset, "dataset_args": {}, "task": SarcasmTask, "task_args": {}, @@ -14,9 +14,6 @@ def config(): "class_labels": ["TRUE", "FALSE"], "max_tries": 1, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/sarcasm/ArSarcasm2/testing_data.csv" - }, } diff --git a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT4_FewShot.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT4_FewShot.py index e2ffb68b..9a02878f 100644 --- a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT4_FewShot.py +++ b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT4_FewShot.py @@ -1,11 +1,11 @@ -from llmebench.datasets import ArSarcasmDataset +from llmebench.datasets import ArSarcasm2Dataset from llmebench.models import OpenAIModel from llmebench.tasks import SarcasmTask def config(): return { - "dataset": ArSarcasmDataset, + "dataset": ArSarcasm2Dataset, "dataset_args": {}, "task": SarcasmTask, "task_args": {}, @@ -14,12 +14,6 @@ def config(): "class_labels": ["TRUE", "FALSE"], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/sarcasm/ArSarcasm2/testing_data.csv", - "fewshot": { - "train_data_path": "data/sentiment_emotion_others/sarcasm/ArSarcasm2/training_data.csv", - }, - }, } diff --git a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT4_ZeroShot.py 
b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT4_ZeroShot.py index 1bb67f11..de689177 100644 --- a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT4_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_GPT4_ZeroShot.py @@ -1,11 +1,11 @@ -from llmebench.datasets import ArSarcasmDataset +from llmebench.datasets import ArSarcasm2Dataset from llmebench.models import OpenAIModel from llmebench.tasks import SarcasmTask def config(): return { - "dataset": ArSarcasmDataset, + "dataset": ArSarcasm2Dataset, "dataset_args": {}, "task": SarcasmTask, "task_args": {}, @@ -14,9 +14,6 @@ def config(): "class_labels": ["TRUE", "FALSE"], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/sarcasm/ArSarcasm2/testing_data.csv", - }, } diff --git a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_BLOOMZ_ZeroShot.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_BLOOMZ_ZeroShot.py index 27e0310f..8f333dd7 100644 --- a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_BLOOMZ_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_BLOOMZ_ZeroShot.py @@ -16,9 +16,6 @@ def config(): "class_labels": ["TRUE", "FALSE"], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/sarcasm/ArSarcasm/ArSarcasm_test.csv" - }, } diff --git a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT35_ZeroShot.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT35_ZeroShot.py index 1908a9b8..7fc1ea14 100644 --- a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT35_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT35_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["TRUE", "FALSE"], "max_tries": 30, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/sarcasm/ArSarcasm/ArSarcasm_test.csv" - }, } diff --git a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT4_FewShot.py 
b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT4_FewShot.py index 70d6ec9c..18938d7c 100644 --- a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT4_FewShot.py +++ b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT4_FewShot.py @@ -14,12 +14,6 @@ def config(): "class_labels": ["TRUE", "FALSE"], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/sarcasm/ArSarcasm/ArSarcasm_test.csv", - "fewshot": { - "train_data_path": "data/sentiment_emotion_others/sarcasm/ArSarcasm/ArSarcasm_train.csv", - }, - }, } diff --git a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT4_ZeroShot.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT4_ZeroShot.py index 98a8eeda..edb75fa0 100644 --- a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT4_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_GPT4_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["TRUE", "FALSE"], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/sarcasm/ArSarcasm/ArSarcasm_test.csv" - }, } diff --git a/assets/ar/sentiment_emotion_others/sentiment/ArSAS_BLOOMZ_ZeroShot.py b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_BLOOMZ_ZeroShot.py index ffaf11ec..34d09dfa 100644 --- a/assets/ar/sentiment_emotion_others/sentiment/ArSAS_BLOOMZ_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_BLOOMZ_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["Positive", "Negative", "Neutral", "Mixed"], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/sentiment/ArSAS-test.txt" - }, } diff --git a/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT35_ZeroShot.py b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT35_ZeroShot.py index 28b85067..fe6b72dc 100644 --- a/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT35_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT35_ZeroShot.py @@ -14,9 
+14,6 @@ def config(): "class_labels": ["Positive", "Negative", "Neutral", "Mixed"], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/sentiment/ArSAS-test.txt" - }, } diff --git a/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT4_FewShot.py b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT4_FewShot.py index 2209dd98..26d4d343 100644 --- a/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT4_FewShot.py +++ b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT4_FewShot.py @@ -16,12 +16,6 @@ def config(): "class_labels": ["Positive", "Negative", "Neutral", "Mixed"], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/sentiment/ArSAS-test.txt", - "fewshot": { - "train_data_path": "data/sentiment_emotion_others/sentiment/ArSAS-train.txt", - }, - }, } diff --git a/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT4_ZeroShot.py b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT4_ZeroShot.py index c89cfef7..812018df 100644 --- a/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT4_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_GPT4_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["Positive", "Negative", "Neutral", "Mixed"], "max_tries": 3, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/sentiment/ArSAS-test.txt" - }, } diff --git a/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_BLOOMZ_ZeroShot.py b/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_BLOOMZ_ZeroShot.py similarity index 76% rename from assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_BLOOMZ_ZeroShot.py rename to assets/ar/sentiment_emotion_others/stance_detection/ANSStance_BLOOMZ_ZeroShot.py index 9708d980..ae9b265c 100644 --- a/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_BLOOMZ_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_BLOOMZ_ZeroShot.py @@ -1,11 
+1,11 @@ -from llmebench.datasets import Khouja20StanceDataset +from llmebench.datasets import ANSStanceDataset from llmebench.models import PetalsModel from llmebench.tasks import StanceTask def config(): return { - "dataset": Khouja20StanceDataset, + "dataset": ANSStanceDataset, "dataset_args": {}, "task": StanceTask, "task_args": {}, @@ -14,9 +14,6 @@ def config(): "class_labels": ["agree", "disagree"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/stance/test.csv" - }, } diff --git a/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT35_ZeroShot.py b/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_GPT35_ZeroShot.py similarity index 79% rename from assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT35_ZeroShot.py rename to assets/ar/sentiment_emotion_others/stance_detection/ANSStance_GPT35_ZeroShot.py index cc3be81d..869e097a 100644 --- a/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT35_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_GPT35_ZeroShot.py @@ -1,11 +1,11 @@ -from llmebench.datasets import Khouja20StanceDataset +from llmebench.datasets import ANSStanceDataset from llmebench.models import LegacyOpenAIModel from llmebench.tasks import StanceTask def config(): return { - "dataset": Khouja20StanceDataset, + "dataset": ANSStanceDataset, "dataset_args": {}, "task": StanceTask, "task_args": {}, @@ -14,9 +14,6 @@ def config(): "class_labels": ["agree", "disagree"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/stance/test.csv" - }, } diff --git a/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT4_FewShot.py b/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_GPT4_FewShot.py similarity index 83% rename from 
assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT4_FewShot.py rename to assets/ar/sentiment_emotion_others/stance_detection/ANSStance_GPT4_FewShot.py index ee5ee991..af3ca6a6 100644 --- a/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT4_FewShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_GPT4_FewShot.py @@ -1,11 +1,11 @@ -from llmebench.datasets import Khouja20StanceDataset +from llmebench.datasets import ANSStanceDataset from llmebench.models import OpenAIModel from llmebench.tasks import StanceTask def config(): return { - "dataset": Khouja20StanceDataset, + "dataset": ANSStanceDataset, "dataset_args": {}, "task": StanceTask, "task_args": {}, @@ -14,12 +14,6 @@ def config(): "class_labels": ["agree", "disagree"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/stance/test.csv", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/stance/train.csv" - }, - }, } diff --git a/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT4_ZeroShot.py b/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_GPT4_ZeroShot.py similarity index 85% rename from assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT4_ZeroShot.py rename to assets/ar/sentiment_emotion_others/stance_detection/ANSStance_GPT4_ZeroShot.py index b57a122d..32ce79b8 100644 --- a/assets/ar/sentiment_emotion_others/stance_detection/Khouja20Stance_GPT4_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_GPT4_ZeroShot.py @@ -1,11 +1,11 @@ -from llmebench.datasets import Khouja20StanceDataset +from llmebench.datasets import ANSStanceDataset from llmebench.models import OpenAIModel from llmebench.tasks import StanceTask def config(): return { - "dataset": Khouja20StanceDataset, + "dataset": ANSStanceDataset, "dataset_args": {}, "task": 
StanceTask, "task_args": {}, @@ -14,9 +14,6 @@ def config(): "class_labels": ["agree", "disagree"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/stance/test.csv" - }, } diff --git a/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_BLOOMZ_ZeroShot.py b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_BLOOMZ_ZeroShot.py index 6a476667..4837dcc7 100644 --- a/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_BLOOMZ_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_BLOOMZ_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["agree", "disagree", "discuss", "unrelated"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_stance_ramy/ramy_arabic_stance.jsonl" - }, } diff --git a/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT35_ZeroShot.py b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT35_ZeroShot.py index b7d08d07..bb0378ec 100644 --- a/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT35_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT35_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["agree", "disagree", "unrelated"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_stance_ramy/ramy_arabic_stance.jsonl" - }, } diff --git a/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_FewShot.py b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_FewShot.py index ec10be86..66f01968 100644 --- a/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_FewShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_FewShot.py @@ -15,12 +15,6 @@ def config(): "model_args": 
{ "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_stance_ramy/ramy_arabic_stance.jsonl", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/stance/train.csv" - }, - }, } diff --git a/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_ZeroShot.py b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_ZeroShot.py index 28fa0447..84cfcbdc 100644 --- a/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_ZeroShot.py +++ b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_GPT4_ZeroShot.py @@ -15,9 +15,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/factuality_stance_ramy/ramy_arabic_stance.jsonl" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT35_ZeroShot.py index 90e264b6..a07091d5 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT35_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT35_ZeroShot.py @@ -25,9 +25,6 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/NER/AnerCorp/ANERCorp_CamelLab_test.txt" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT4_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT4_FewShot.py index 2f1c1781..d0631c6e 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT4_FewShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT4_FewShot.py @@ -25,12 +25,6 @@ def config(): ], "max_tries": 50, }, - "general_args": { - "data_path": 
"data/sequence_tagging_ner_pos_etc/NER/AnerCorp/ANERCorp_CamelLab_test.txt", - "fewshot": { - "train_data_path": "data/sequence_tagging_ner_pos_etc/NER/AnerCorp/ANERCorp_CamelLab_train.txt" - }, - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT4_ZeroShot.py index c257b5d8..cebccf3d 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_GPT4_ZeroShot.py @@ -25,9 +25,6 @@ def config(): ], "max_tries": 150, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/NER/AnerCorp/ANERCorp_CamelLab_test.txt" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT35_ZeroShot.py index ea4655a5..323d125c 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT35_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT35_ZeroShot.py @@ -25,12 +25,6 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": { - "split": "test", - "path": "data/sequence_tagging_ner_pos_etc/NER/aqmar/AQMAR_Arabic_NER_corpus-1.0", - } - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT4_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT4_FewShot.py index 41efb9ff..786a530e 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT4_FewShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT4_FewShot.py @@ -25,18 +25,7 @@ def config(): ], "max_tries": 50, }, - "general_args": { - "data_path": { - "split": "test", - "path": "data/sequence_tagging_ner_pos_etc/NER/aqmar/AQMAR_Arabic_NER_corpus-1.0", - }, - "fewshot": { - "train_data_path": { - "split": 
"dev", - "path": "data/sequence_tagging_ner_pos_etc/NER/aqmar/AQMAR_Arabic_NER_corpus-1.0", - } - }, - }, + "general_args": {"fewshot": {"train_split": "dev"}}, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT4_ZeroShot.py index 22ebb550..49d821d3 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_GPT4_ZeroShot.py @@ -25,12 +25,6 @@ def config(): ], "max_tries": 150, }, - "general_args": { - "data_path": { - "split": "test", - "path": "data/sequence_tagging_ner_pos_etc/NER/aqmar/AQMAR_Arabic_NER_corpus-1.0", - }, - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/NER/MGBWords_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/NER/MGBWords_GPT35_ZeroShot.py index 642a3e85..b2edd976 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/NER/MGBWords_GPT35_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/NER/MGBWords_GPT35_ZeroShot.py @@ -26,9 +26,6 @@ def config(): ], "max_tries": 50, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/NER/mgb/MGB-words.txt" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/NER/MGBWords_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/NER/MGBWords_GPT4_ZeroShot.py index b3058736..3558860a 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/NER/MGBWords_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/NER/MGBWords_GPT4_ZeroShot.py @@ -26,9 +26,6 @@ def config(): ], "max_tries": 150, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/NER/mgb/MGB-words.txt" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabicPOS_GPT4_ZeroShot.py 
b/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabicPOS_GPT4_ZeroShot.py index eeb6271c..e72d7899 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabicPOS_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabicPOS_GPT4_ZeroShot.py @@ -94,33 +94,16 @@ def config(): - sets = [ - ("egy", "egy.pos/egy.data_5.test.src-trg.sent"), - ("glf", "glf.pos/glf.data_5.test.src-trg.sent"), - ("mgr", "mgr.pos/mgr.data_5.test.src-trg.sent"), - ("lev", "lev.pos/lev.data_5.test.src-trg.sent"), - ] - configs = [] - for name, testset in sets: - configs.append( - { - "name": name, - "config": { - "dataset": QCRIDialectalArabicPOSDataset, - "dataset_args": {}, - "task": ArabicPOSTask, - "task_args": {}, - "model": OpenAIModel, - "model_args": { - "max_tries": 3, - }, - "general_args": { - "data_path": f"data/sequence_tagging_ner_pos_etc/POS/{testset}" - }, - }, - } - ) - return configs + return { + "dataset": QCRIDialectalArabicPOSDataset, + "dataset_args": {}, + "task": ArabicPOSTask, + "task_args": {}, + "model": OpenAIModel, + "model_args": { + "max_tries": 3, + }, + } def prompt(input_sample): diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT35_ZeroShot.py index 53d38fd9..1d960646 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT35_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT35_ZeroShot.py @@ -94,33 +94,16 @@ def config(): - sets = [ - ("egy", "egy.pos/egy.data_5.test.src-trg.sent"), - ("glf", "glf.pos/glf.data_5.test.src-trg.sent"), - ("mgr", "mgr.pos/mgr.data_5.test.src-trg.sent"), - ("lev", "lev.pos/lev.data_5.test.src-trg.sent"), - ] - configs = [] - for name, testset in sets: - configs.append( - { - "name": name, - "config": { - 
"dataset": QCRIDialectalArabicPOSDataset, - "dataset_args": {}, - "task": ArabicPOSTask, - "task_args": {}, - "model": LegacyOpenAIModel, - "model_args": { - "max_tries": 3, - }, - "general_args": { - "data_path": f"data/sequence_tagging_ner_pos_etc/POS/{testset}" - }, - }, - } - ) - return configs + return { + "dataset": QCRIDialectalArabicPOSDataset, + "dataset_args": {}, + "task": ArabicPOSTask, + "task_args": {}, + "model": LegacyOpenAIModel, + "model_args": { + "max_tries": 3, + }, + } def prompt(input_sample): diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT4_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT4_FewShot.py index a95a95fb..1228f6f4 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT4_FewShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT4_FewShot.py @@ -94,52 +94,26 @@ def config(): - sets = [ - ( - "egy", - "egy.pos/egy.data_5.test.src-trg.sent", - "egy.pos/egy.data_5.dev.src-trg.sent", - ), - ( - "glf", - "glf.pos/glf.data_5.test.src-trg.sent", - "glf.pos/glf.data_5.dev.src-trg.sent", - ), - ( - "mgr", - "mgr.pos/mgr.data_5.test.src-trg.sent", - "mgr.pos/mgr.data_5.dev.src-trg.sent", - ), - ( - "lev", - "lev.pos/lev.data_5.test.src-trg.sent", - "lev.pos/lev.data_5.dev.src-trg.sent", - ), - ] - configs = [] - for name, testset, devset in sets: - configs.append( - { - "name": name, - "config": { - "dataset": QCRIDialectalArabicPOSDataset, - "dataset_args": {}, - "task": ArabicPOSTask, - "task_args": {}, - "model": OpenAIModel, - "model_args": { - "max_tries": 30, - }, - "general_args": { - "data_path": f"data/sequence_tagging_ner_pos_etc/POS/{testset}", - "fewshot": { - "train_data_path": f"data/sequence_tagging_ner_pos_etc/POS/{devset}" - }, - }, - }, + return { + "dataset": QCRIDialectalArabicPOSDataset, + "dataset_args": {}, + "task": ArabicPOSTask, + "task_args": {}, 
+ "model": OpenAIModel, + "model_args": { + "max_tries": 30, + }, + "general_args": { + "fewshot": { + "train_split": [ + "glf.data_5/dev", + "lev.data_5/dev", + "egy.data_5/dev", + "mgr.data_5/dev", + ], } - ) - return configs + }, + } def few_shot_prompt(input_sample, base_prompt, examples): diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT35_ZeroShot.py index 12328a90..a9d5daf7 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT35_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT35_ZeroShot.py @@ -103,9 +103,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/POS/WikiNewsTruth.txt.POS.tab" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_FewShot.py index b62459f5..53175a6a 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_FewShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_FewShot.py @@ -103,12 +103,6 @@ def config(): "model_args": { "max_tries": 30, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/POS/WikiNewsTruth.txt.POS.tab", - "fewshot": { - "train_data_path": "data/sequence_tagging_ner_pos_etc/POS/WikiNewsTruthDev.txt" - }, - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_ZeroShot.py index 468bd7b6..eb884752 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_GPT4_ZeroShot.py @@ -103,9 +103,6 @@ def config(): "model_args": { 
"max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/POS/WikiNewsTruth.txt.POS.tab" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT35_ZeroShot.py index 9a244f2a..af101a61 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT35_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT35_ZeroShot.py @@ -103,9 +103,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/POS/XGLUE/ar.test.src-trg.txt" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_FewShot.py index 89ac6b8e..59770b0f 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_FewShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_FewShot.py @@ -103,12 +103,7 @@ def config(): "model_args": { "max_tries": 30, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/POS/XGLUE/ar.test.src-trg.txt", - "fewshot": { - "train_data_path": "data/sequence_tagging_ner_pos_etc/POS/XGLUE/ar.dev.src-trg.txt" - }, - }, + "general_args": {"fewshot": {"train_split": "dev"}}, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_ZeroShot.py index 779d7b85..a98c56bc 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_GPT4_ZeroShot.py @@ -103,9 +103,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/POS/XGLUE/ar.test.src-trg.txt" - }, } diff --git 
a/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT35_ZeroShot.py index dba58e89..4430e081 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT35_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT35_ZeroShot.py @@ -4,32 +4,16 @@ def config(): - sets = [ - ("mor", "morrocan_f05.test.src-tgt.txt"), - ("tun", "tunisian_f05.test.src-tgt.txt"), - ] - configs = [] - for name, testset in sets: - configs.append( - { - "name": name, - "config": { - "dataset": BibleMaghrebiDiacritizationDataset, - "dataset_args": {}, - "task": ArabicDiacritizationTask, - "task_args": {}, - "model": LegacyOpenAIModel, - "model_args": { - "max_tries": 3, - }, - "general_args": { - "data_path": f"data/sequence_tagging_ner_pos_etc/diacritization/{testset}" - }, - }, - } - ) - - return configs + return { + "dataset": BibleMaghrebiDiacritizationDataset, + "dataset_args": {}, + "task": ArabicDiacritizationTask, + "task_args": {}, + "model": LegacyOpenAIModel, + "model_args": { + "max_tries": 3, + }, + } def prompt(input_sample): diff --git a/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT4_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT4_FewShot.py index ee6f5ebe..11583848 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT4_FewShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT4_FewShot.py @@ -4,36 +4,21 @@ def config(): - sets = [ - ("mor", "morrocan_f05.test.src-trg.txt", "morrocan_f05.dev.src-trg.txt"), - ("tun", "tunisian_f05.test.src-trg.txt", "tunisian_f05.dev.src-trg.txt"), - ] - configs = [] - for name, testset, devset in sets: - configs.append( - { - "name": name, - 
"config": { - "dataset": BibleMaghrebiDiacritizationDataset, - "dataset_args": {}, - "task": ArabicDiacritizationTask, - "task_args": {}, - "model": OpenAIModel, - "model_args": { - "max_tries": 3, - }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/diacritization/" - + testset, - "fewshot": { - "train_data_path": "data/sequence_tagging_ner_pos_etc/diacritization/" - + devset - }, - }, - }, + return { + "dataset": BibleMaghrebiDiacritizationDataset, + "dataset_args": {}, + "task": ArabicDiacritizationTask, + "task_args": {}, + "model": OpenAIModel, + "model_args": { + "max_tries": 3, + }, + "general_args": { + "fewshot": { + "train_split": ["morrocan_f05/dev", "tunisian_f05/dev"], } - ) - return configs + }, + } def few_shot_prompt(input_sample, base_prompt, examples): diff --git a/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT4_ZeroShot.py index dd34688e..0cba7910 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_GPT4_ZeroShot.py @@ -4,31 +4,16 @@ def config(): - sets = [ - ("mor", "morrocan_f05.test.src-tgt.txt"), - ("tun", "tunisian_f05.test.src-tgt.txt"), - ] - configs = [] - for name, testset in sets: - configs.append( - { - "name": name, - "config": { - "dataset": BibleMaghrebiDiacritizationDataset, - "dataset_args": {}, - "task": ArabicDiacritizationTask, - "task_args": {}, - "model": OpenAIModel, - "model_args": { - "max_tries": 3, - }, - "general_args": { - "data_path": f"data/sequence_tagging_ner_pos_etc/diacritization/{testset}" - }, - }, - } - ) - return configs + return { + "dataset": BibleMaghrebiDiacritizationDataset, + "dataset_args": {}, + "task": ArabicDiacritizationTask, + "task_args": {}, + "model": OpenAIModel, + "model_args": { + 
"max_tries": 3, + }, + } def prompt(input_sample): diff --git a/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT35_ZeroShot.py index f4140bc5..638cb3fb 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT35_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT35_ZeroShot.py @@ -13,9 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/diacritization/WikiNewsTruth.txt" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT4_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT4_FewShot.py index 12f3f8bc..9ee86f2b 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT4_FewShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT4_FewShot.py @@ -13,12 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/diacritization/WikiNewsTruth.txt", - "fewshot": { - "train_data_path": "data/sequence_tagging_ner_pos_etc/diacritization/WikiNewsTruthDev.txt" - }, - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT4_ZeroShot.py index 40763548..f32fb228 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_GPT4_ZeroShot.py @@ -13,9 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": 
"data/sequence_tagging_ner_pos_etc/diacritization/WikiNewsTruth.txt" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_BLOOMZ_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_BLOOMZ_ZeroShot.py index 15607a9b..7e1f6c28 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_BLOOMZ_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_BLOOMZ_ZeroShot.py @@ -30,9 +30,6 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/dialect_identification/all_v2.tsv", - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT35_ZeroShot.py index 8c71b114..570ef199 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT35_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT35_ZeroShot.py @@ -27,9 +27,6 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/dialect_identification/all_v2.tsv" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_FewShot.py index 4d03f6e0..b59a6377 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_FewShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_FewShot.py @@ -28,11 +28,7 @@ def config(): "max_tries": 30, }, "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/dialect_identification/all_v2.tsv", - "fewshot": { - "train_data_path": 
"data/sequence_tagging_ner_pos_etc/dialect_identification/fewshot_dev.tsv", # TODO update - "deduplicate": False, - }, + "fewshot": {"deduplicate": False, "train_split": "dev"}, }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_ZeroShot.py index 7fe16686..76454ec9 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_GPT4_ZeroShot.py @@ -27,9 +27,6 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/dialect_identification/all_v2.tsv" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_BLOOMZ_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_BLOOMZ_ZeroShot.py index 29ed7338..4f29e745 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_BLOOMZ_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_BLOOMZ_ZeroShot.py @@ -34,9 +34,6 @@ def config(): ], "max_tries": 0, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/dialect_identification/QADI_test-PalestinePS-corrected.txt", - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_GPT35_ZeroShot.py index e8bb3f5b..9d1ec884 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_GPT35_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_GPT35_ZeroShot.py @@ -31,9 +31,6 @@ def config(): ], "max_tries": 3, }, - "general_args": { - 
"data_path": "data/sequence_tagging_ner_pos_etc/dialect_identification/QADI_test-PalestinePS-corrected.txt" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_GPT4_ZeroShot.py index 5b0ec551..aab262de 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_GPT4_ZeroShot.py @@ -31,9 +31,6 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/dialect_identification/QADI_test-PalestinePS-corrected.txt" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_BLOOMZ_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_BLOOMZ_ZeroShot.py index 7bdd0ce4..d8c721df 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_BLOOMZ_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_BLOOMZ_ZeroShot.py @@ -13,9 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/lemmatization/WikiNews-26-06-2015-RefLemma.txt" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT35_ZeroShot.py index 4ab1f6ba..a9c6ca14 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT35_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT35_ZeroShot.py @@ -13,9 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": 
"data/sequence_tagging_ner_pos_etc/lemmatization/WikiNews-26-06-2015-RefLemma.txt" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT4_ZeroShot.py index 62d6a6d0..8f101963 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_GPT4_ZeroShot.py @@ -13,9 +13,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/lemmatization/WikiNews-26-06-2015-RefLemma.txt" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT35_ZeroShot.py index c3746373..4cca2bd0 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT35_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT35_ZeroShot.py @@ -15,9 +15,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/Parsing/arabic_PADT_test_gs.conll" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT4_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT4_FewShot.py index b6af9ff9..701e5ce5 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT4_FewShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT4_FewShot.py @@ -15,12 +15,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/parsing/arabic_PADT_test_gs.conll", - "fewshot": { - "train_data_path": "data/sequence_tagging_ner_pos_etc/parsing/arabic_PADT_train.conll" - }, - }, } diff --git 
a/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT4_ZeroShot.py index 298cb4f1..b758cebc 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_GPT4_ZeroShot.py @@ -15,9 +15,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/Parsing/arabic_PADT_test_gs.conll" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT35_ZeroShot.py index 7517482b..2d9b81bc 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT35_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT35_ZeroShot.py @@ -6,33 +6,16 @@ def config(): - sets = [ - ("egy", "egy.seg/egy.data_5.test.src.sent"), - ("glf", "glf.seg/glf.data_5.test.src.sent"), - ("mgr", "mgr.seg/mgr.data_5.test.src.sent"), - ("lev", "lev.seg/lev.data_5.test.src.sent"), - ] - configs = [] - for name, testset in sets: - configs.append( - { - "name": name, - "config": { - "dataset": QCRIDialectalArabicSegmentationDataset, - "dataset_args": {}, - "task": ArabicSegmentationTask, - "task_args": {}, - "model": LegacyOpenAIModel, - "model_args": { - "max_tries": 3, - }, - "general_args": { - "data_path": f"data/sequence_tagging_ner_pos_etc/segmentation/{testset}" - }, - }, - } - ) - return configs + return { + "dataset": QCRIDialectalArabicSegmentationDataset, + "dataset_args": {}, + "task": ArabicSegmentationTask, + "task_args": {}, + "model": LegacyOpenAIModel, + "model_args": { + "max_tries": 3, + }, + } def prompt(input_sample): diff --git 
a/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT4_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT4_FewShot.py index aae1b4c1..82d41ccb 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT4_FewShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT4_FewShot.py @@ -6,39 +6,26 @@ def config(): - sets = [ - ("egy", "egy.seg/egy.data_5.test.src.sent", "egy.seg/egy.data_5.dev.src.sent"), - ("glf", "glf.seg/glf.data_5.test.src.sent", "glf.seg/glf.data_5.dev.src.sent"), - ("mgr", "mgr.seg/mgr.data_5.test.src.sent", "mgr.seg/mgr.data_5.dev.src.sent"), - ("lev", "lev.seg/lev.data_5.test.src.sent", "lev.seg/lev.data_5.dev.src.sent"), - ] - - configs = [] - for name, testset, devset in sets: - configs.append( - { - "name": name, - "config": { - "dataset": QCRIDialectalArabicSegmentationDataset, - "dataset_args": {}, - "task": ArabicSegmentationTask, - "task_args": {}, - "model": OpenAIModel, - "model_args": { - "max_tries": 3, - }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/segmentation/" - + testset, - "fewshot": { - "train_data_path": "data/sequence_tagging_ner_pos_etc/segmentation/" - + devset - }, - }, - }, + return { + "dataset": QCRIDialectalArabicSegmentationDataset, + "dataset_args": {}, + "task": ArabicSegmentationTask, + "task_args": {}, + "model": OpenAIModel, + "model_args": { + "max_tries": 3, + }, + "general_args": { + "fewshot": { + "train_split": [ + "glf.data_5/dev", + "lev.data_5/dev", + "egy.data_5/dev", + "mgr.data_5/dev", + ], } - ) - return configs + }, + } def few_shot_prompt(input_sample, base_prompt, examples): diff --git a/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT4_ZeroShot.py 
index 8431e344..6fdbc0b8 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_GPT4_ZeroShot.py @@ -6,34 +6,16 @@ def config(): - sets = [ - ("egy", "egy.seg/egy.data_5.test.src.sent"), - ("glf", "glf.seg/glf.data_5.test.src.sent"), - ("mgr", "mgr.seg/mgr.data_5.test.src.sent"), - ("lev", "lev.seg/lev.data_5.test.src.sent"), - ] - configs = [] - for name, testset in sets: - configs.append( - { - "name": name, - "config": { - "dataset": QCRIDialectalArabicSegmentationDataset, - "dataset_args": {}, - "task": ArabicSegmentationTask, - "task_args": {}, - "model": OpenAIModel, - "model_args": { - "max_tries": 3, - }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/segmentation/" - + testset - }, - }, - } - ) - return configs + return { + "dataset": QCRIDialectalArabicSegmentationDataset, + "dataset_args": {}, + "task": ArabicSegmentationTask, + "task_args": {}, + "model": OpenAIModel, + "model_args": { + "max_tries": 3, + }, + } def prompt(input_sample): diff --git a/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT35_ZeroShot.py index ab439a2e..0d8a76bc 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT35_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT35_ZeroShot.py @@ -15,9 +15,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/segmentation/WikiNewsTruth.txt" - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT4_FewShot.py b/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT4_FewShot.py index 5a3b93ce..4c927a13 100644 --- 
a/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT4_FewShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT4_FewShot.py @@ -15,12 +15,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/segmentation/WikiNewsTruth.txt", - "fewshot": { - "train_data_path": "data/sequence_tagging_ner_pos_etc/segmentation/WikiNewsTruthDev.txt" - }, - }, } diff --git a/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT4_ZeroShot.py index 2d29556c..4767bd36 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_GPT4_ZeroShot.py @@ -15,9 +15,6 @@ def config(): "model_args": { "max_tries": 3, }, - "general_args": { - "data_path": "data/sequence_tagging_ner_pos_etc/segmentation/WikiNewsTruth.txt" - }, } diff --git a/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py b/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py index 62b19237..b46cc26e 100644 --- a/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py +++ b/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py @@ -14,9 +14,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/bulgarian/CT22_bulgarian_1A_checkworthy_test_gold.tsv" - }, + "general_args": {"test_split": "bg"}, } diff --git a/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py 
b/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py index ce10ac99..b4a71f93 100644 --- a/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py +++ b/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py @@ -16,12 +16,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/bulgarian/CT22_bulgarian_1A_checkworthy_test_gold.tsv", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/checkworthyness/bulgarian/CT22_bulgarian_1A_checkworthy_train.tsv", - }, - }, + "general_args": {"test_split": "bg", "fewshot": {"train_split": "bg"}}, } diff --git a/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py b/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py index 09c14fe5..25c359f4 100644 --- a/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py +++ b/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py @@ -16,9 +16,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/bulgarian/CT22_bulgarian_1A_checkworthy_test_gold.tsv" - }, + "general_args": {"test_split": "bg"}, } diff --git a/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py b/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py index 5d0f08b8..0a9619ef 100644 --- a/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py +++ b/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_BLOOMZ_ZeroShot.py @@ -14,9 +14,6 @@ def 
config(): "class_labels": ["Positive", "Negative", "Neutral"], "max_tries": 10, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/sentiment/bn/bn_all_test.tsv" - }, } diff --git a/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_GPT4_FewShot.py b/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_GPT4_FewShot.py index 195fcf30..23026396 100644 --- a/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_GPT4_FewShot.py +++ b/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_GPT4_FewShot.py @@ -14,12 +14,6 @@ def config(): "class_labels": ["Positive", "Negative", "Neutral"], "max_tries": 20, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/sentiment/bn/bn_all_test.tsv", - "fewshot": { - "train_data_path": "data/sentiment_emotion_others/sentiment/bn/bn_all_train.tsv", - }, - }, } diff --git a/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_GPT4_ZeroShot.py b/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_GPT4_ZeroShot.py index b2cd5dc8..272a173e 100644 --- a/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_GPT4_ZeroShot.py +++ b/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_GPT4_ZeroShot.py @@ -14,9 +14,6 @@ def config(): "class_labels": ["Positive", "Negative", "Neutral"], "max_tries": 20, }, - "general_args": { - "data_path": "data/sentiment_emotion_others/sentiment/bn/bn_all_test.tsv", - }, } diff --git a/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py b/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py index d52e0c58..6045c479 100644 --- a/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py +++ b/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py @@ -43,9 +43,7 @@ def config(): ], "max_tries": 3, }, - "general_args": { - 
"data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/ge_dev_subtask3.json" - }, + "general_args": {"test_split": "de/dev"}, } diff --git a/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py b/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py index 6e97d472..b7734863 100644 --- a/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py +++ b/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py @@ -44,10 +44,8 @@ def config(): "max_tries": 30, }, "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/ge_dev_subtask3.json", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/ge_train_subtask3.json", - }, + "test_split": "de/dev", + "fewshot": {"train_split": "de/train"}, }, } diff --git a/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py b/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py index 745c854a..6be7390f 100644 --- a/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py +++ b/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py @@ -43,9 +43,7 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/ge_dev_subtask3.json", - }, + "general_args": {"test_split": "de/dev"}, } diff --git a/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py b/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py index bfda9c8f..8d41b16d 100644 --- 
a/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py +++ b/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py @@ -14,9 +14,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/english/CT22_english_1A_checkworthy_test_gold.tsv" - }, + "general_args": {"test_split": "en"}, } diff --git a/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py b/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py index 44d6e6b1..1fc3bd00 100644 --- a/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py +++ b/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py @@ -16,12 +16,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/english/CT22_english_1A_checkworthy_test_gold.tsv", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/checkworthyness/english/CT22_english_1A_checkworthy_train.tsv", - }, - }, + "general_args": {"test_split": "en", "fewshot": {"train_split": "en"}}, } diff --git a/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py b/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py index e2690657..d357a80c 100644 --- a/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py +++ b/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py @@ -16,9 +16,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 30, }, - 
"general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/english/CT22_english_1A_checkworthy_test_gold.tsv" - }, + "general_args": {"test_split": "en"}, } diff --git a/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py b/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py index 431165bc..1ec435ae 100644 --- a/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py +++ b/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py @@ -43,9 +43,7 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/en_dev_subtask3.json" - }, + "general_args": {"test_split": "en/dev"}, } diff --git a/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py b/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py index eb05398e..f0067fba 100644 --- a/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py +++ b/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py @@ -44,10 +44,8 @@ def config(): "max_tries": 30, }, "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/en_dev_subtask3.json", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/en_train_subtask3.json", - }, + "test_split": "en/dev", + "fewshot": {"train_split": "en/train"}, }, } diff --git a/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py b/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py index 6ae57afc..c515f690 
100644 --- a/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py +++ b/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py @@ -43,9 +43,7 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/en_dev_subtask3.json", - }, + "general_args": {"test_split": "en/dev"}, } diff --git a/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py b/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py index bda9962d..b1ae7bd1 100644 --- a/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py +++ b/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py @@ -14,9 +14,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/spanish/CT22_spanish_1A_checkworthy_test_gold.tsv" - }, + "general_args": {"test_split": "es"}, } diff --git a/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py b/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py index 8f4871ea..b378b0fe 100644 --- a/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py +++ b/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py @@ -16,12 +16,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/spanish/CT22_spanish_1A_checkworthy_test_gold.tsv", - "fewshot": { - "train_data_path": 
"data/factuality_disinformation_harmful_content/checkworthyness/spanish/CT22_spanish_1A_checkworthy_train.tsv", - }, - }, + "general_args": {"test_split": "es", "fewshot": {"train_split": "es"}}, } diff --git a/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py b/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py index b58737df..d1b75a11 100644 --- a/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py +++ b/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py @@ -16,9 +16,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/spanish/CT22_spanish_1A_checkworthy_test_gold.tsv" - }, + "general_args": {"test_split": "es"}, } diff --git a/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py b/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py index ddc8b858..577634ad 100644 --- a/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py +++ b/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py @@ -43,9 +43,7 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/fr_dev_subtask3.json" - }, + "general_args": {"test_split": "fr/dev"}, } diff --git a/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py b/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py index f5fd03ad..ed0fbe42 100644 --- 
a/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py +++ b/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py @@ -44,10 +44,8 @@ def config(): "max_tries": 30, }, "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/fr_dev_subtask3.json", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/fr_train_subtask3.json", - }, + "test_split": "fr/dev", + "fewshot": {"train_split": "fr/train"}, }, } diff --git a/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py b/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py index 41f2faaa..925770d3 100644 --- a/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py +++ b/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py @@ -43,9 +43,7 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/fr_dev_subtask3.json", - }, + "general_args": {"test_split": "fr/dev"}, } diff --git a/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py b/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py index 38f7828d..122cddf6 100644 --- a/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py +++ b/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py @@ -43,9 +43,7 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/it_dev_subtask3.json" - }, + "general_args": {"test_split": "it/dev"}, 
} diff --git a/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py b/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py index fa717aed..c7a6a6a5 100644 --- a/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py +++ b/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py @@ -44,10 +44,8 @@ def config(): "max_tries": 30, }, "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/it_dev_subtask3.json", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/it_train_subtask3.json", - }, + "test_split": "it/dev", + "fewshot": {"train_split": "it/train"}, }, } diff --git a/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py b/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py index 518be26b..79b2df2c 100644 --- a/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py +++ b/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py @@ -43,9 +43,7 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/it_dev_subtask3.json", - }, + "general_args": {"test_split": "it/dev"}, } diff --git a/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py b/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py index 87f2a978..0f7528c0 100644 --- a/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py +++ 
b/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py @@ -14,9 +14,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/dutch/CT22_dutch_1A_checkworthy_test_gold.tsv" - }, + "general_args": {"test_split": "nl"}, } diff --git a/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py b/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py index 87f172a8..6b2b614c 100644 --- a/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py +++ b/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py @@ -16,12 +16,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/dutch/CT22_dutch_1A_checkworthy_test_gold.tsv", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/checkworthyness/dutch/CT22_dutch_1A_checkworthy_train.tsv", - }, - }, + "general_args": {"test_split": "nl", "fewshot": {"train_split": "nl"}}, } diff --git a/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py b/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py index 64cac5f4..f43f6fc4 100644 --- a/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py +++ b/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py @@ -16,9 +16,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 30, }, - "general_args": { - "data_path": 
"data/factuality_disinformation_harmful_content/checkworthyness/dutch/CT22_dutch_1A_checkworthy_test_gold.tsv" - }, + "general_args": {"test_split": "nl"}, } diff --git a/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py b/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py index 37e12c1d..eb1e0259 100644 --- a/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py +++ b/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py @@ -43,9 +43,7 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/po_dev_subtask3.json" - }, + "general_args": {"test_split": "pl/dev"}, } diff --git a/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py b/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py index 54f5cb09..7d169729 100644 --- a/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py +++ b/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py @@ -44,10 +44,8 @@ def config(): "max_tries": 30, }, "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/po_dev_subtask3.json", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/po_train_subtask3.json", - }, + "test_split": "pl/dev", + "fewshot": {"train_split": "pl/train"}, }, } diff --git a/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py b/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py index f46b8385..1e637551 100644 --- 
a/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py +++ b/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py @@ -43,9 +43,7 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/po_dev_subtask3.json", - }, + "general_args": {"test_split": "pl/dev"}, } diff --git a/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py b/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py index 2acb5da6..2ce2e9ab 100644 --- a/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py +++ b/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_BLOOMZ_ZeroShot.py @@ -43,9 +43,7 @@ def config(): ], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/ru_dev_subtask3.json" - }, + "general_args": {"test_split": "ru/dev"}, } diff --git a/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py b/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py index 93c86922..c5f07948 100644 --- a/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py +++ b/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_FewShot.py @@ -44,10 +44,8 @@ def config(): "max_tries": 30, }, "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/ru_dev_subtask3.json", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/ru_train_subtask3.json", - }, + "test_split": "ru/dev", + "fewshot": {"train_split": "ru/train"}, }, } 
diff --git a/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py b/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py index addf2585..843b7adb 100644 --- a/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py +++ b/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_GPT4_ZeroShot.py @@ -43,9 +43,7 @@ def config(): ], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/propaganda_semeval23/ru_dev_subtask3.json", - }, + "general_args": {"test_split": "ru/dev"}, } diff --git a/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py b/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py index 776c998c..af3d3518 100644 --- a/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py +++ b/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_BLOOMZ_ZeroShot.py @@ -14,9 +14,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 3, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/turkish/CT22_turkish_1A_checkworthy_test_gold.tsv" - }, + "general_args": {"test_split": "tr"}, } diff --git a/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py b/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py index abf77031..55c81ab0 100644 --- a/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py +++ b/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot.py @@ -16,12 +16,7 @@ def config(): "class_labels": ["0", "1"], 
"max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/turkish/CT22_turkish_1A_checkworthy_test_gold.tsv", - "fewshot": { - "train_data_path": "data/factuality_disinformation_harmful_content/checkworthyness/turkish/CT22_turkish_1A_checkworthy_train.tsv", - }, - }, + "general_args": {"test_split": "tr", "fewshot": {"train_split": "tr"}}, } diff --git a/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py b/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py index 315e9174..182f5fdf 100644 --- a/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py +++ b/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot.py @@ -16,9 +16,7 @@ def config(): "class_labels": ["0", "1"], "max_tries": 30, }, - "general_args": { - "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/turkish/CT22_turkish_1A_checkworthy_test_gold.tsv" - }, + "general_args": {"test_split": "tr"}, } diff --git a/llmebench/benchmark.py b/llmebench/benchmark.py index a895a8e1..c7637e39 100644 --- a/llmebench/benchmark.py +++ b/llmebench/benchmark.py @@ -19,6 +19,7 @@ class SingleTaskBenchmark(object): def __init__( self, + name, config, prompt_fn, post_process_fn, @@ -28,10 +29,19 @@ def __init__( limit=-1, n_shots=0, ): + self.name = name + # Pipeline components - self.dataset = config["dataset"](**config["dataset_args"]) - self.task = config["task"](dataset=self.dataset, **config["task_args"]) - self.model = config["model"](**config["model_args"]) + dataset_args = config.get("dataset_args", {}) + self.dataset = config["dataset"](**dataset_args) + + task_args = config.get("task_args", {}) + self.task = config["task"](dataset=self.dataset, **task_args) + + model_args = config.get("model_args", {}) + self.model = 
config["model"](**model_args) + + general_args = config.get("general_args", {}) # Caching parameters self.cache_dir = cache_dir @@ -43,14 +53,19 @@ def __init__( self.post_process_fn = post_process_fn # Data parameters - self.data_path = config["general_args"]["data_path"] + self.data_paths = utils.get_data_paths(config, "test") + self.zeroshot = True - if "fewshot" in config["general_args"]: + if utils.is_fewshot_asset(config, prompt_fn): self.zeroshot = False - self.train_data_path = config["general_args"]["fewshot"]["train_data_path"] - self.deduplicate = config["general_args"]["fewshot"].get( - "deduplicate", True - ) + self.deduplicate = True + self.train_data_paths = utils.get_data_paths(config, "train") + + assert len(self.data_paths) == len( + self.train_data_paths + ), "A train split must be provided for every test split being run" + if "fewshot" in general_args: + self.deduplicate = general_args["fewshot"].get("deduplicate", True) self.limit = limit self.n_shots = n_shots @@ -59,7 +74,12 @@ def is_zeroshot(self): return self.zeroshot def run_pipeline( - self, sample_key, input_sample, few_shot_examples, cache_payload=None + self, + sample_key, + input_sample, + few_shot_examples, + cache_payload=None, + dry_run=False, ): summarized_payload = {} @@ -75,6 +95,9 @@ def run_pipeline( prompt = self.prompt_fn(input_sample) cache_payload["prompt"] = prompt + if dry_run: + return cache_payload, summarized_payload + # Run the model if "model_output" in cache_payload: logging.info(f"\tLoading model output from cache") @@ -115,101 +138,130 @@ def run_pipeline( return cache_payload, summarized_payload - def run_benchmark(self): - # Handle cache - if not self.is_zeroshot(): - self.cache_dir = self.cache_dir / f"{self.n_shots}_shot" - - # Create parent directory - if not self.cache_dir.exists(): - self.cache_dir.mkdir(parents=True) - - # Local cache - full_summary_path = self.cache_dir / "summary.jsonl" - failed_summary_path = self.cache_dir / "summary_failed.jsonl" - 
- data = self.dataset.load_data(self.data_path) - few_shots_data = [] - if not self.zeroshot: - train_data = self.dataset.load_data(self.train_data_path) + def run_benchmark(self, dry_run=False): + base_name = self.name + base_cache_dir = self.cache_dir - few_shots_data = self.dataset.prepare_fewshots( - data, train_data, self.n_shots, deduplicate=self.deduplicate - ) - - true_labels = [] - predictions = [] + # Create sub-directory for few shot experiments + if not self.is_zeroshot(): + base_name = f"{self.name}/{self.n_shots}_shot" + base_cache_dir = self.cache_dir / f"{self.n_shots}_shot" + + all_task_results = {} + for split_idx, (split_name, data_path) in enumerate(self.data_paths): + name = base_name + cache_dir = base_cache_dir + if len(self.data_paths) > 1: + name = f"{self.name}/{split_name}" + cache_dir = cache_dir / split_name + + # Create parent directory + if not cache_dir.exists(): + cache_dir.mkdir(parents=True) + + # Local cache + full_summary_path = cache_dir / "summary.jsonl" + failed_summary_path = cache_dir / "summary_failed.jsonl" + + data = self.dataset.load_data(data_path) + few_shots_data = [] + if not self.zeroshot: + train_split_name, train_data_path = self.train_data_paths[split_idx] + train_data = self.dataset.load_data(train_data_path) + + few_shots_data = self.dataset.prepare_fewshots( + data, train_data, self.n_shots, deduplicate=self.deduplicate + ) - num_processed = 0 - full_summary_fp = open(full_summary_path, "w") + true_labels = [] + predictions = [] + + num_processed = 0 + full_summary_fp = open(full_summary_path, "w") + + num_failed = 0 + failed_summary_fp = open(failed_summary_path, "w") + + for sample_idx, (input_sample, few_shot_examples) in enumerate( + zip_longest(data, few_shots_data, fillvalue=None) + ): + if self.limit > 0 and sample_idx >= self.limit: + break + logging.info(f"Running sample {sample_idx}: {input_sample['input']}") + num_processed += 1 + cache_path = cache_dir / f"{sample_idx}.json" + 
true_labels.append(input_sample["label"]) + + cache_payload = {"input": input_sample} + + if few_shot_examples is not None: + cache_payload["few_shot_examples"] = few_shot_examples + + if cache_path.exists() and not self.ignore_cache and not dry_run: + with open(cache_path, "r") as fp: + cache_payload = json.load(fp) + + summarized_payload = { + "input": input_sample["input"], + "label": input_sample["label"], + } + + cache_payload, partial_summarized_payload = self.run_pipeline( + sample_idx, + input_sample["input"], + few_shot_examples, + cache_payload, + dry_run, + ) - num_failed = 0 - failed_summary_fp = open(failed_summary_path, "w") + summarized_payload.update(partial_summarized_payload) - for sample_idx, (input_sample, few_shot_examples) in enumerate( - zip_longest(data, few_shots_data, fillvalue=None) - ): - if self.limit > 0 and sample_idx >= self.limit: - break - logging.info(f"Running sample {sample_idx}: {input_sample['input']}") - num_processed += 1 - cache_path = self.cache_dir / f"{sample_idx}.json" - true_labels.append(input_sample["label"]) + if "filtered_output" in cache_payload: + predictions.append(cache_payload["filtered_output"]) + full_summary_fp.write( + json.dumps(summarized_payload, ensure_ascii=False) + "\n" + ) + else: + if not dry_run: + logging.error(f"\tNo prediction for sample") + num_failed += 1 + predictions.append(None) + full_summary_fp.write( + json.dumps(summarized_payload, ensure_ascii=False) + "\n" + ) + failed_summary_fp.write( + json.dumps(summarized_payload, ensure_ascii=False) + "\n" + ) - cache_payload = {"input": input_sample} + # Save the cache payload + with open(cache_path, "w") as fp: + json.dump(cache_payload, fp, ensure_ascii=False) - if few_shot_examples is not None: - cache_payload["few_shot_examples"] = few_shot_examples + full_summary_fp.close() + failed_summary_fp.close() - if cache_path.exists() and not self.ignore_cache: - with open(cache_path, "r") as fp: - cache_payload = json.load(fp) + if num_failed > 
0: + logging.error( + f"{num_failed}/{len(data)} samples do not have any predictions" + ) + evaluation_scores = self.task.evaluate(true_labels, predictions) - summarized_payload = { - "input": input_sample["input"], - "label": input_sample["label"], + # Prepare results + task_results = { + "num_processed": num_processed, + "num_failed": num_failed, + "evaluation_scores": evaluation_scores, } + logging.info(f"{name}: {task_results['evaluation_scores']}") - cache_payload, partial_summarized_payload = self.run_pipeline( - sample_idx, input_sample["input"], few_shot_examples, cache_payload - ) - - summarized_payload.update(partial_summarized_payload) - - if "filtered_output" in cache_payload: - predictions.append(cache_payload["filtered_output"]) - full_summary_fp.write( - json.dumps(summarized_payload, ensure_ascii=False) + "\n" - ) - else: - logging.error(f"\tNo prediction for sample") - num_failed += 1 - predictions.append(None) - full_summary_fp.write( - json.dumps(summarized_payload, ensure_ascii=False) + "\n" - ) - failed_summary_fp.write( - json.dumps(summarized_payload, ensure_ascii=False) + "\n" - ) - - # Save the cache payload - with open(cache_path, "w") as fp: - json.dump(cache_payload, fp, ensure_ascii=False) + task_result_path = cache_dir / "results.json" - full_summary_fp.close() - failed_summary_fp.close() + with open(task_result_path, "w") as fp: + json.dump(task_results, fp, ensure_ascii=False) - if num_failed > 0: - logging.error( - f"{num_failed}/{len(data)} samples do not have any predictions" - ) - evaluation_scores = self.task.evaluate(true_labels, predictions) + all_task_results[name] = task_results - return { - "num_processed": num_processed, - "num_failed": num_failed, - "evaluation_scores": evaluation_scores, - } + return all_task_results class Benchmark(object): @@ -293,6 +345,12 @@ def main(): "-e", "--env", type=Path, help="Path to an .env file to load model parameters" ) + parser.add_argument( + "--dry-run", + action="store_true", + 
help="Do not run any actual models, but load all the data and process few shots. Existing cache will be ignored and overwritten.", + ) + group = parser.add_argument_group("Few Shot Experiments") group.add_argument( "-n", @@ -341,6 +399,7 @@ def main(): try: logging.info(f"Running benchmark: {name}") task_benchmark = SingleTaskBenchmark( + name, config, prompt_fn, post_process_fn, @@ -362,18 +421,11 @@ def main(): ) continue - task_results = task_benchmark.run_benchmark() - logging.info(f"{name}: {task_results['evaluation_scores']}") - - task_result_path = task_benchmark.cache_dir / "results.json" - - with open(task_result_path, "w") as fp: - json.dump(task_results, fp, ensure_ascii=False) - - if not task_benchmark.is_zeroshot(): - name = f"{name}_{task_benchmark.n_shots}" + all_task_results = task_benchmark.run_benchmark(dry_run=args.dry_run) + for task_name in all_task_results: + task_results = all_task_results[task_name] - all_results[name] = task_results + all_results[task_name] = task_results except Exception as e: logging.error(f"{name} failed to run") traceback.print_exc() diff --git a/llmebench/datasets/ADI.py b/llmebench/datasets/ADI.py index 580775ba..960e1308 100644 --- a/llmebench/datasets/ADI.py +++ b/llmebench/datasets/ADI.py @@ -1,19 +1,44 @@ import pandas as pd from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class ADIDataset(DatasetBase): def __init__(self, **kwargs): super(ADIDataset, self).__init__(**kwargs) - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "some tweet", "label": "no_not_interesting"} + @staticmethod def metadata(): return { "language": "ar", "citation": """TO DO: in house dataset""", + "splits": { + "dev": "data/sequence_tagging_ner_pos_etc/dialect_identification/fewshot_dev.tsv", + "test": "data/sequence_tagging_ner_pos_etc/dialect_identification/all_v2.tsv", + }, + "task_type": TaskType.Classification, + "class_labels": [ + "EGY", + "IRA", + 
"JOR", + "KSA", + "KUW", + "LEB", + "LIB", + "MOR", + "MSA", + "PAL", + "QAT", + "SUD", + "SYR", + "UAE", + "YEM", + ], } def load_data(self, data_path): diff --git a/llmebench/datasets/ANERcorp.py b/llmebench/datasets/ANERcorp.py index ed82fde3..6fe3fa2d 100644 --- a/llmebench/datasets/ANERcorp.py +++ b/llmebench/datasets/ANERcorp.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class ANERcorpDataset(DatasetBase): def __init__(self, **kwargs): super(ANERcorpDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -45,9 +47,28 @@ def metadata(): language = "English", ISBN = "979-10-95546-34-4", }""", + "link": "https://camel.abudhabi.nyu.edu/anercorp/", + "license": "CC BY-SA 4.0", + "splits": { + "test": "data/sequence_tagging_ner_pos_etc/NER/AnerCorp/ANERCorp_CamelLab_test.txt", + "train": "data/sequence_tagging_ner_pos_etc/NER/AnerCorp/ANERCorp_CamelLab_train.txt", + }, + "task_type": TaskType.SequenceLabeling, + "class_labels": [ + "B-PERS", + "I-PERS", + "B-LOC", + "I-LOC", + "B-ORG", + "I-ORG", + "B-MISC", + "I-MISC", + "O", + ], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return { "input": ".كانت السبب الرئيس في سقوط البيزنطيين بسبب الدمار الذي كانت تخلفه الحملات الأولى المارة في بيزنطة ( مدينة القسطنطينية ) عاصمة الإمبراطورية البيزنطية وتحول حملات لاحقة نحوها", "label": "O O O O O B-PER O O O O O O O O O B-LOC O O B-LOC O O B-LOC I-LOC O O O O O", diff --git a/llmebench/datasets/Khouja20Factuality.py b/llmebench/datasets/ANSFactuality.py similarity index 78% rename from llmebench/datasets/Khouja20Factuality.py rename to llmebench/datasets/ANSFactuality.py index 06ce5e19..5e1f5196 100644 --- a/llmebench/datasets/Khouja20Factuality.py +++ b/llmebench/datasets/ANSFactuality.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType -class 
Khouja20FactualityDataset(DatasetBase): +class ANSFactualityDataset(DatasetBase): def __init__(self, **kwargs): - super(Khouja20FactualityDataset, self).__init__(**kwargs) + super(ANSFactualityDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -21,9 +23,18 @@ def metadata(): pages = "8--17", abstract = "This work explores the application of textual entailment in news claim verification and stance prediction using a new corpus in Arabic. The publicly available corpus comes in two perspectives: a version consisting of 4,547 true and false claims and a version consisting of 3,786 pairs (claim, evidence). We describe the methodology for creating the corpus and the annotation process. Using the introduced corpus, we also develop two machine learning baselines for two proposed tasks: claim verification and stance prediction. Our best model utilizes pretraining (BERT) and achieves 76.7 F1 on the stance prediction task and 64.3 F1 on the claim verification task. Our preliminary experiments shed some light on the limits of automatic claim verification that relies on claims text only. 
Results hint that while the linguistic features and world knowledge learned during pretraining are useful for stance prediction, such learned representations from pretraining are insufficient for verifying claims without access to context or evidence.", }""", + "link": "https://github.com/latynt/ans", + "download_url": "https://github.com/latynt/ans/archive/refs/heads/master.zip", + "splits": { + "test": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/claim/test.csv", + "train": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/claim/train.csv", + }, + "task_type": TaskType.Classification, + "class_labels": ["true", "false"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "الجملة بالعربية", "label": "true", "line_number": "1"} def load_data(self, data_path, no_labels=False): diff --git a/llmebench/datasets/Khouja20Stance.py b/llmebench/datasets/ANSStance.py similarity index 78% rename from llmebench/datasets/Khouja20Stance.py rename to llmebench/datasets/ANSStance.py index 45b66f35..0459fe8e 100644 --- a/llmebench/datasets/Khouja20Stance.py +++ b/llmebench/datasets/ANSStance.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType -class Khouja20StanceDataset(DatasetBase): +class ANSStanceDataset(DatasetBase): def __init__(self, **kwargs): - super(Khouja20StanceDataset, self).__init__(**kwargs) + super(ANSStanceDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -21,9 +23,18 @@ def metadata(): pages = "8--17", abstract = "This work explores the application of textual entailment in news claim verification and stance prediction using a new corpus in Arabic. The publicly available corpus comes in two perspectives: a version consisting of 4,547 true and false claims and a version consisting of 3,786 pairs (claim, evidence). 
We describe the methodology for creating the corpus and the annotation process. Using the introduced corpus, we also develop two machine learning baselines for two proposed tasks: claim verification and stance prediction. Our best model utilizes pretraining (BERT) and achieves 76.7 F1 on the stance prediction task and 64.3 F1 on the claim verification task. Our preliminary experiments shed some light on the limits of automatic claim verification that relies on claims text only. Results hint that while the linguistic features and world knowledge learned during pretraining are useful for stance prediction, such learned representations from pretraining are insufficient for verifying claims without access to context or evidence.", }""", + "link": "https://github.com/latynt/ans", + "download_url": "https://github.com/latynt/ans/archive/refs/heads/master.zip", + "splits": { + "test": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/stance/test.csv", + "train": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/stance/train.csv", + }, + "task_type": TaskType.Classification, + "class_labels": ["agree", "disagree"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return { "input": { "sentence_1": "الجملة الاولى", diff --git a/llmebench/datasets/ARCD.py b/llmebench/datasets/ARCD.py index 0ef13ec9..f4e635a4 100644 --- a/llmebench/datasets/ARCD.py +++ b/llmebench/datasets/ARCD.py @@ -1,12 +1,14 @@ import json from llmebench.datasets.SQuADBase import SQuADBase +from llmebench.tasks import TaskType class ARCDDataset(SQuADBase): def __init__(self, **kwargs): super(ARCDDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -26,4 +28,11 @@ def metadata(): pages = "108--118", abstract = "This paper tackles the problem of open domain factual Arabic question answering (QA) using Wikipedia as our knowledge source. 
This constrains the answer of any question to be a span of text in Wikipedia. Open domain QA for Arabic entails three challenges: annotated QA datasets in Arabic, large scale efficient information retrieval and machine reading comprehension. To deal with the lack of Arabic QA datasets we present the Arabic Reading Comprehension Dataset (ARCD) composed of 1,395 questions posed by crowdworkers on Wikipedia articles, and a machine translation of the Stanford Question Answering Dataset (Arabic-SQuAD). Our system for open domain question answering in Arabic (SOQAL) is based on two components: (1) a document retriever using a hierarchical TF-IDF approach and (2) a neural reading comprehension model using the pre-trained bi-directional transformer BERT. Our experiments on ARCD indicate the effectiveness of our approach with our BERT-based reader achieving a 61.3 F1 score, and our open domain system SOQAL achieving a 27.6 F1 score.", }""", + "link": "https://github.com/husseinmozannar/SOQAL", + "license": "MIT License", + "splits": { + "test": "data/QA/ARCD/arcd-test.json", + "train": "data/QA/ARCD/arcd-train.json", + }, + "task_type": TaskType.QuestionAnswering, } diff --git a/llmebench/datasets/NewsCatASND.py b/llmebench/datasets/ASND.py similarity index 58% rename from llmebench/datasets/NewsCatASND.py rename to llmebench/datasets/ASND.py index a9612899..3254ccd0 100644 --- a/llmebench/datasets/NewsCatASND.py +++ b/llmebench/datasets/ASND.py @@ -1,15 +1,18 @@ import pandas as pd from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType -class NewsCatASNDDataset(DatasetBase): +class ASNDDataset(DatasetBase): def __init__(self, **kwargs): - super(NewsCatASNDDataset, self).__init__(**kwargs) + super(ASNDDataset, self).__init__(**kwargs) - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "some tweet", "label": "crime-war-conflict"} + @staticmethod def metadata(): return { "language": "ar", @@ -29,6 
+32,27 @@ def metadata(): url = "https://aclanthology.org/2020.wanlp-1.21", pages = "226--236", }""", + "link": "https://github.com/shammur/Arabic_news_text_classification_datasets/", + "license": "CC BY 4.0", + "splits": { + "test": "data/news_categorization/Arabic_Social_Media_News_Dataset_ASND/sm_news_ar_tst.csv", + "train": "data/news_categorization/Arabic_Social_Media_News_Dataset_ASND/sm_news_ar_trn.csv", + }, + "task_type": TaskType.Classification, + "class_labels": [ + "crime-war-conflict", + "spiritual", + "health", + "politics", + "human-rights-press-freedom", + "education", + "business-and-economy", + "art-and-entertainment", + "others", + "science-and-technology", + "sports", + "environment", + ], } def load_data(self, data_path): diff --git a/llmebench/datasets/Adult.py b/llmebench/datasets/Adult.py index 6a406649..35111879 100644 --- a/llmebench/datasets/Adult.py +++ b/llmebench/datasets/Adult.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class AdultDataset(DatasetBase): def __init__(self, **kwargs): super(AdultDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -21,9 +23,18 @@ def metadata(): url = "https://aclanthology.org/2021.wanlp-1.14", pages = "136--144", }""", + "link": "https://alt.qcri.org/resources/AdultContentDetection.zip", + "license": "Research Purpose Only", + "splits": { + "test": "data/factuality_disinformation_harmful_content/adult/adult-test.tsv", + "train": "data/factuality_disinformation_harmful_content/adult/adult-train.tsv", + }, + "task_type": TaskType.Classification, + "class_labels": ["ADULT", "NOT_ADULT"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "نص عادي", "label": "NOT_ADULT"} def load_data(self, data_path, no_labels=False): diff --git a/llmebench/datasets/Aqmar.py b/llmebench/datasets/Aqmar.py index a5fb1473..ce66e1f9 100644 --- a/llmebench/datasets/Aqmar.py +++ 
b/llmebench/datasets/Aqmar.py @@ -1,6 +1,7 @@ from pathlib import Path from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class AqmarDataset(DatasetBase): @@ -45,6 +46,7 @@ def __init__(self, **kwargs): ], ) + @staticmethod def metadata(): return { "language": "ar", @@ -63,9 +65,34 @@ def metadata(): url = \"https://aclanthology.org/E12-1017\", pages = \"162--173\", }""", + "link": "http://www.cs.cmu.edu/~ark/AQMAR/", + "license": "CC BY-SA 3.0", + "splits": { + "test": { + "split": "test", + "path": "data/sequence_tagging_ner_pos_etc/NER/aqmar/AQMAR_Arabic_NER_corpus-1.0", + }, + "dev": { + "split": "dev", + "path": "data/sequence_tagging_ner_pos_etc/NER/aqmar/AQMAR_Arabic_NER_corpus-1.0", + }, + }, + "task_type": TaskType.SequenceLabeling, + "class_labels": [ + "B-PERS", + "I-PERS", + "B-LOC", + "I-LOC", + "B-ORG", + "I-ORG", + "B-MISC", + "I-MISC", + "O", + ], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return { "input": ".كانت السبب الرئيس في سقوط البيزنطيين بسبب الدمار الذي كانت تخلفه الحملات الأولى المارة في بيزنطة ( مدينة القسطنطينية ) عاصمة الإمبراطورية البيزنطية وتحول حملات لاحقة نحوها", "label": "O O O O O B-PER O O O O O O O O O B-LOC O O B-LOC O O B-LOC I-LOC O O O O O", diff --git a/llmebench/datasets/ArSAS.py b/llmebench/datasets/ArSAS.py index a5bf0858..b4b2d52d 100644 --- a/llmebench/datasets/ArSAS.py +++ b/llmebench/datasets/ArSAS.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class ArSASDataset(DatasetBase): def __init__(self, **kwargs): super(ArSASDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -13,13 +15,21 @@ def metadata(): author={AbdelRahim Elmadany and Hamdy Mubarak and Walid Magdy}, year={2018} }""", + "link": "https://homepages.inf.ed.ac.uk/wmagdy/resources.htm", + "license": "Research Purpose Only", + "splits": { + "test": 
"data/sentiment_emotion_others/sentiment/ArSAS-test.txt", + "train": "data/sentiment_emotion_others/sentiment/ArSAS-train.txt", + }, + "task_type": TaskType.Classification, + "class_labels": ["Positive", "Negative", "Neutral", "Mixed"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "Tweet", "label": "Positive"} def load_data(self, data_path, no_labels=False): - # TODO: modify to iterator data = [] with open(data_path, "r") as fp: for line_idx, line in enumerate(fp): diff --git a/llmebench/datasets/ArSarcasm.py b/llmebench/datasets/ArSarcasm.py index 889bf151..ac1182e6 100644 --- a/llmebench/datasets/ArSarcasm.py +++ b/llmebench/datasets/ArSarcasm.py @@ -1,12 +1,14 @@ import csv from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class ArSarcasmDataset(DatasetBase): def __init__(self, **kwargs): super(ArSarcasmDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -23,9 +25,18 @@ def metadata(): language = "English", ISBN = "979-10-95546-51-1", }""", + "link": "https://github.com/iabufarha/ArSarcasm", + "license": "MIT License", + "splits": { + "test": "data/sentiment_emotion_others/sarcasm/ArSarcasm/ArSarcasm_test.csv", + "train": "data/sentiment_emotion_others/sarcasm/ArSarcasm/ArSarcasm_train.csv", + }, + "task_type": TaskType.Classification, + "class_labels": ["TRUE", "FALSE"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "A tweet", "label": "TRUE"} def load_data(self, data_path): diff --git a/llmebench/datasets/ArSarcasm2.py b/llmebench/datasets/ArSarcasm2.py new file mode 100644 index 00000000..5c606467 --- /dev/null +++ b/llmebench/datasets/ArSarcasm2.py @@ -0,0 +1,53 @@ +import csv + +from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType + + +class ArSarcasm2Dataset(DatasetBase): + def __init__(self, **kwargs): + super(ArSarcasm2Dataset, 
self).__init__(**kwargs) + + @staticmethod + def metadata(): + return { + "language": "ar", + "citation": """@inproceedings{abufarha-etal-2021-arsarcasm-v2, + title = "Overview of the WANLP 2021 Shared Task on Sarcasm and Sentiment Detection in Arabic", + author = "Abu Farha, Ibrahim and + Zaghouani, Wajdi and + Magdy, Walid", + booktitle = "Proceedings of the Sixth Arabic Natural Language Processing Workshop", + month = april, + year = "2021", + }""", + "link": "https://github.com/iabufarha/ArSarcasm-v2", + "license": "MIT License", + "splits": { + "test": "data/sentiment_emotion_others/sarcasm/ArSarcasm2/testing_data.csv", + "train": "data/sentiment_emotion_others/sarcasm/ArSarcasm2/training_data.csv", + }, + "task_type": TaskType.Classification, + "class_labels": ["TRUE", "FALSE"], + } + + @staticmethod + def get_data_sample(): + return {"input": "A tweet", "label": "TRUE"} + + def load_data(self, data_path): + data = [] + with open(data_path, "r", encoding="utf-8") as fp: + reader = csv.DictReader(fp) + for line_idx, row in enumerate(reader): + data.append( + { + "input": row["tweet"], + "label": row[ + "sarcasm" + ].upper(), # To get it to work on ArSarcasm (True/False) and ArSarcasm-2 (TRUE/FALSE) + "line_number": line_idx, + } + ) + + return data diff --git a/llmebench/datasets/AraBench.py b/llmebench/datasets/AraBench.py index 52211582..35200220 100644 --- a/llmebench/datasets/AraBench.py +++ b/llmebench/datasets/AraBench.py @@ -1,12 +1,14 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class AraBenchDataset(DatasetBase): - def __init__(self, src, tgt, **kwargs): + def __init__(self, src_lang, tgt_lang, **kwargs): super(AraBenchDataset, self).__init__(**kwargs) - self.src = src - self.tgt = tgt + self.src_lang = src_lang + self.tgt_lang = tgt_lang + @staticmethod def metadata(): return { "language": "ar", @@ -25,17 +27,258 @@ def metadata(): doi = "10.18653/v1/2020.coling-main.447", pages = "5094--5107" }""", 
+ "link": "https://alt.qcri.org/resources1/mt/arabench/", + "license": "Apache License, Version 2.0", + "splits": { + "APT-LEV_ldc_web_lv.lev.0.lv": { + "dev": "data/MT/ldc_web_lv.dev.lev.0.lv", + "test": "data/MT/ldc_web_lv.test.lev.0.lv", + "train": "data/MT/ldc_web_lv.train.lev.0.lv", + }, + "APT-Nile_ldc_web_eg.nil.0.eg": { + "dev": "data/MT/ldc_web_eg.dev.nil.0.eg", + "test": "data/MT/ldc_web_eg.test.nil.0.eg", + "train": "data/MT/ldc_web_eg.train.nil.0.eg", + }, + "Bible-MGR_bible.mgr.0.ma": { + "dev": "data/MT/bible.dev.mgr.0.ma", + "test": "data/MT/bible.test.mgr.0.ma", + "train": "data/MT/bible.train.mgr.0.ma", + }, + "Bible-MGR_bible.mgr.0.tn": { + "dev": "data/MT/bible.dev.mgr.0.tn", + "test": "data/MT/bible.test.mgr.0.tn", + "train": "data/MT/bible.train.mgr.0.tn", + }, + "Bible-MSA_bible.msa.0.ms": { + "dev": "data/MT/bible.dev.msa.0.ms", + "test": "data/MT/bible.test.msa.0.ms", + "train": "data/MT/bible.train.msa.0.ms", + }, + "Bible-MSA_bible.msa.1.ms": { + "dev": "data/MT/bible.dev.msa.1.ms", + "test": "data/MT/bible.test.msa.1.ms", + "train": "data/MT/bible.train.msa.1.ms", + }, + "MADAR-Gulf_madar.glf.0.iq": { + "dev": "data/MT/madar.dev.glf.0.qa", + "test": "data/MT/madar.test.glf.0.iq", + "train": "data/MT/madar.train.glf.0.qa", + }, + "MADAR-Gulf_madar.glf.1.iq": { + "dev": "data/MT/madar.dev.glf.0.qa", + "test": "data/MT/madar.test.glf.1.iq", + "train": "data/MT/madar.train.glf.0.qa", + }, + "MADAR-Gulf_madar.glf.2.iq": { + "dev": "data/MT/madar.dev.glf.0.qa", + "test": "data/MT/madar.test.glf.2.iq", + "train": "data/MT/madar.train.glf.0.qa", + }, + "MADAR-Gulf_madar.glf.0.om": { + "dev": "data/MT/madar.dev.glf.0.qa", + "test": "data/MT/madar.test.glf.0.om", + "train": "data/MT/madar.train.glf.0.qa", + }, + "MADAR-Gulf_madar.glf.0.qa": { + "dev": "data/MT/madar.dev.glf.0.qa", + "test": "data/MT/madar.test.glf.0.qa", + "train": "data/MT/madar.train.glf.0.qa", + }, + "MADAR-Gulf_madar.glf.0.sa": { + "dev": "data/MT/madar.dev.glf.0.qa", + "test": 
"data/MT/madar.test.glf.0.sa", + "train": "data/MT/madar.train.glf.0.qa", + }, + "MADAR-Gulf_madar.glf.1.sa": { + "dev": "data/MT/madar.dev.glf.0.qa", + "test": "data/MT/madar.test.glf.1.sa", + "train": "data/MT/madar.train.glf.0.qa", + }, + "MADAR-Gulf_madar.glf.0.ye": { + "dev": "data/MT/madar.dev.glf.0.qa", + "test": "data/MT/madar.test.glf.0.ye", + "train": "data/MT/madar.train.glf.0.qa", + }, + "MADAR-LEV_madar.lev.0.jo": { + "dev": "data/MT/madar.dev.lev.0.lb", + "test": "data/MT/madar.test.lev.0.jo", + "train": "data/MT/madar.train.lev.0.lb", + }, + "MADAR-LEV_madar.lev.1.jo": { + "dev": "data/MT/madar.dev.lev.0.lb", + "test": "data/MT/madar.test.lev.1.jo", + "train": "data/MT/madar.train.lev.0.lb", + }, + "MADAR-LEV_madar.lev.0.lb": { + "dev": "data/MT/madar.dev.lev.0.lb", + "test": "data/MT/madar.test.lev.0.lb", + "train": "data/MT/madar.train.lev.0.lb", + }, + "MADAR-LEV_madar.lev.0.pa": { + "dev": "data/MT/madar.dev.lev.0.lb", + "test": "data/MT/madar.test.lev.0.pa", + "train": "data/MT/madar.train.lev.0.lb", + }, + "MADAR-LEV_madar.lev.0.sy": { + "dev": "data/MT/madar.dev.lev.0.lb", + "test": "data/MT/madar.test.lev.0.sy", + "train": "data/MT/madar.train.lev.0.lb", + }, + "MADAR-LEV_madar.lev.1.sy": { + "dev": "data/MT/madar.dev.lev.0.lb", + "test": "data/MT/madar.test.lev.1.sy", + "train": "data/MT/madar.train.lev.0.lb", + }, + "MADAR-MGR_madar.mgr.0.dz": { + "dev": "data/MT/madar.dev.mgr.0.ma", + "test": "data/MT/madar.test.mgr.0.dz", + "train": "data/MT/madar.train.mgr.0.ma", + }, + "MADAR-MGR_madar.mgr.0.ly": { + "dev": "data/MT/madar.dev.mgr.0.ma", + "test": "data/MT/madar.test.mgr.0.ly", + "train": "data/MT/madar.train.mgr.0.ma", + }, + "MADAR-MGR_madar.mgr.1.ly": { + "dev": "data/MT/madar.dev.mgr.0.ma", + "test": "data/MT/madar.test.mgr.1.ly", + "train": "data/MT/madar.train.mgr.0.ma", + }, + "MADAR-MGR_madar.mgr.0.ma": { + "dev": "data/MT/madar.dev.mgr.0.ma", + "test": "data/MT/madar.test.mgr.0.ma", + "train": "data/MT/madar.train.mgr.0.ma", + 
}, + "MADAR-MGR_madar.mgr.1.ma": { + "dev": "data/MT/madar.dev.mgr.0.ma", + "test": "data/MT/madar.test.mgr.1.ma", + "train": "data/MT/madar.train.mgr.0.ma", + }, + "MADAR-MGR_madar.mgr.0.tn": { + "dev": "data/MT/madar.dev.mgr.0.ma", + "test": "data/MT/madar.test.mgr.0.tn", + "train": "data/MT/madar.train.mgr.0.tn", + }, + "MADAR-MGR_madar.mgr.1.tn": { + "dev": "data/MT/madar.dev.mgr.0.ma", + "test": "data/MT/madar.test.mgr.1.tn", + "train": "data/MT/madar.train.mgr.0.tn", + }, + "MADAR-MSA_madar.msa.0.ms": { + "dev": "data/MT/madar.dev.msa.0.ms", + "test": "data/MT/madar.test.msa.0.ms", + "train": "data/MT/madar.train.msa.0.ms", + }, + "MADAR-Nile_madar.nil.0.eg": { + "dev": "data/MT/madar.dev.nil.0.eg", + "test": "data/MT/madar.test.nil.0.eg", + "train": "data/MT/madar.train.nil.0.eg", + }, + "MADAR-Nile_madar.nil.1.eg": { + "dev": "data/MT/madar.dev.nil.0.eg", + "test": "data/MT/madar.test.nil.1.eg", + "train": "data/MT/madar.train.nil.0.eg", + }, + "MADAR-Nile_madar.nil.2.eg": { + "dev": "data/MT/madar.dev.nil.0.eg", + "test": "data/MT/madar.test.nil.2.eg", + "train": "data/MT/madar.train.nil.0.eg", + }, + "MADAR-Nile_madar.nil.0.sd": { + "dev": "data/MT/madar.dev.nil.0.eg", + "test": "data/MT/madar.test.nil.0.sd", + "train": "data/MT/madar.train.nil.0.eg", + }, + "MDC-LEV_ldc_web_eg.lev.0.jo": { + "dev": "data/MT/ldc_web_eg.dev.lev.0.sy", + "test": "data/MT/ldc_web_eg.test.lev.0.jo", + }, + "MDC-LEV_ldc_web_eg.lev.0.ps": { + "dev": "data/MT/ldc_web_eg.dev.lev.0.sy", + "test": "data/MT/ldc_web_eg.test.lev.0.ps", + }, + "MDC-LEV_ldc_web_eg.lev.0.sy": { + "dev": "data/MT/ldc_web_eg.dev.lev.0.sy", + "test": "data/MT/ldc_web_eg.test.lev.0.sy", + }, + "MDC-MGR_ldc_web_eg.mgr.0.tn": { + "test": "data/MT/ldc_web_eg.test.mgr.0.tn", + }, + "MDC-MSA_ldc_web_eg.msa.0.ms": { + "test": "data/MT/ldc_web_eg.test.msa.0.ms", + }, + "Media-Gulf_summa-Oman.glf.0.om": { + "test": "data/MT/summa-Oman.test.glf.0.om", + }, + "Media-LEV_summa-LBC.lev.0.lb": { + "test": 
"data/MT/summa-LBC.test.lev.0.lb", + }, + "Media-MGR_summa-2M.mgr.0.ma": { + "test": "data/MT/summa-2M.test.mgr.0.ma", + }, + "Media-MSA_summa-AJ.msa.0.ms": { + "test": "data/MT/summa-AJ.test.msa.0.ms", + }, + "Media-MSA_summa-BBC.msa.0.ms": { + "test": "data/MT/summa-BBC.test.msa.0.ms", + }, + "QAraC-Gulf_QAraC.glf.0.qa": { + "dev": "data/MT/QAraC.dev.glf.0.qa", + "test": "data/MT/QAraC.test.glf.0.qa", + }, + "default": [ + "APT-LEV_ldc_web_lv.lev.0.lv", + "APT-Nile_ldc_web_eg.nil.0.eg", + "Bible-MGR_bible.mgr.0.ma", + "Bible-MGR_bible.mgr.0.tn", + "Bible-MSA_bible.msa.0.ms", + "Bible-MSA_bible.msa.1.ms", + "MADAR-Gulf_madar.glf.0.iq", + "MADAR-Gulf_madar.glf.1.iq", + "MADAR-Gulf_madar.glf.2.iq", + "MADAR-Gulf_madar.glf.0.om", + "MADAR-Gulf_madar.glf.0.qa", + "MADAR-Gulf_madar.glf.0.sa", + "MADAR-Gulf_madar.glf.1.sa", + "MADAR-Gulf_madar.glf.0.ye", + "MADAR-LEV_madar.lev.0.jo", + "MADAR-LEV_madar.lev.1.jo", + "MADAR-LEV_madar.lev.0.lb", + "MADAR-LEV_madar.lev.0.pa", + "MADAR-LEV_madar.lev.0.sy", + "MADAR-LEV_madar.lev.1.sy", + "MADAR-MGR_madar.mgr.0.dz", + "MADAR-MGR_madar.mgr.0.ly", + "MADAR-MGR_madar.mgr.1.ly", + "MADAR-MGR_madar.mgr.0.ma", + "MADAR-MGR_madar.mgr.1.ma", + "MADAR-MGR_madar.mgr.0.tn", + "MADAR-MGR_madar.mgr.1.tn", + "MADAR-MSA_madar.msa.0.ms", + "MADAR-Nile_madar.nil.0.eg", + "MADAR-Nile_madar.nil.1.eg", + "MADAR-Nile_madar.nil.2.eg", + "MADAR-Nile_madar.nil.0.sd", + "MDC-LEV_ldc_web_eg.lev.0.jo", + "MDC-LEV_ldc_web_eg.lev.0.ps", + "MDC-LEV_ldc_web_eg.lev.0.sy", + "MDC-MGR_ldc_web_eg.mgr.0.tn", + "MDC-MSA_ldc_web_eg.msa.0.ms", + ], + }, + "task_type": TaskType.SequenceToSequence, } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "Sentence in language #1", "label": "Sentence in language #2"} def load_data(self, data_path, no_labels=False): - # TODO: modify to iterator data = [] - with open(data_path + self.src, "r") as fpsrc, open( - data_path + self.tgt, "r" + with open(f"{data_path}.{self.src_lang}", "r") as 
fpsrc, open( + f"{data_path}.{self.tgt_lang}", "r" ) as fptgt: for line_idx, (srcline, tgtline) in enumerate(zip(fpsrc, fptgt)): data.append( diff --git a/llmebench/datasets/ArabGend.py b/llmebench/datasets/ArabGend.py index d4763c1b..760a4985 100644 --- a/llmebench/datasets/ArabGend.py +++ b/llmebench/datasets/ArabGend.py @@ -1,26 +1,40 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class ArabGendDataset(DatasetBase): def __init__(self, **kwargs): super(ArabGendDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", - "citation": """@article{mubarak2022arabgend, - title={ArabGend: Gender analysis and inference on {A}rabic Twitter}, - author={Mubarak, Hamdy and Chowdhury, Shammur Absar and Alam, Firoj}, - journal={arXiv preprint arXiv:2203.00271}, - year={2022} + "citation": """@inproceedings{mubarak-etal-2022-arabgend, + title = "{A}rab{G}end: Gender Analysis and Inference on {A}rabic {T}witter", + author = "Mubarak, Hamdy and + Chowdhury, Shammur Absar and + Alam, Firoj", + booktitle = "Proceedings of the Eighth Workshop on Noisy User-generated Text (W-NUT 2022)", + month = oct, + year = "2022", + address = "Gyeongju, Republic of Korea", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.wnut-1.14", + pages = "124--135", + abstract = "Gender analysis of Twitter can reveal important socio-cultural differences between male and female users. There has been a significant effort to analyze and automatically infer gender in the past for most widely spoken languages{'} content, however, to our knowledge very limited work has been done for Arabic. In this paper, we perform an extensive analysis of differences between male and female users on the Arabic Twitter-sphere. We study differences in user engagement, topics of interest, and the gender gap in professions. 
Along with gender analysis, we also propose a method to infer gender by utilizing usernames, profile pictures, tweets, and networks of friends. In order to do so, we manually annotated gender and locations for {\textasciitilde}166K Twitter accounts associated with {\textasciitilde}92K user location, which we plan to make publicly available. Our proposed gender inference method achieve an F1 score of 82.1{\\%} (47.3{\\%} higher than majority baseline). We also developed a demo and made it publicly available.", }""", + "license": "Research Purpose Only", + "splits": {"test": "data/demographic_attributes/gender/gender-test.txt"}, + "task_type": TaskType.Classification, + "class_labels": ["m", "f"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "A name", "label": "m"} def load_data(self, data_path, no_labels=False): - # TODO: modify to iterator data = [] with open(data_path, "r") as fp: for line_idx, line in enumerate(fp): diff --git a/llmebench/datasets/ArapTweet.py b/llmebench/datasets/ArapTweet.py index 30c5f140..fd3fda90 100644 --- a/llmebench/datasets/ArapTweet.py +++ b/llmebench/datasets/ArapTweet.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class ArapTweetDataset(DatasetBase): def __init__(self, **kwargs): super(ArapTweetDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -28,9 +30,16 @@ def metadata(): pages={198--204}, year={2019} }""", + "splits": { + "test": "data/demographic_attributes/gender/test-ARAP-unique.txt", + "train": "data/demographic_attributes/gender/train-wajdi.tsv", + }, + "task_type": TaskType.Classification, + "class_labels": ["Female", "Male"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "A name", "label": "m"} def load_data(self, data_path, no_labels=False): diff --git a/llmebench/datasets/BanglaSentiment.py 
b/llmebench/datasets/BanglaSentiment.py index 9ad8b2e3..6fc045f7 100644 --- a/llmebench/datasets/BanglaSentiment.py +++ b/llmebench/datasets/BanglaSentiment.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class BanglaSentimentDataset(DatasetBase): def __init__(self, **kwargs): super(BanglaSentimentDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "bn", @@ -22,9 +24,18 @@ def metadata(): Year = {2020}, url={https://github.com/banglanlp/bangla-sentiment-classification}, }""", + "link": "https://github.com/banglanlp/bangla-sentiment-classification", + "license": "CC BY-NC-SA 2.0", + "splits": { + "test": "data/sentiment_emotion_others/sentiment/bn/bn_all_test.tsv", + "train": "data/sentiment_emotion_others/sentiment/bn/bn_all_train.tsv", + }, + "task_type": TaskType.Classification, + "class_labels": ["Positive", "Negative", "Neutral"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "Tweet", "label": "Positive", "line_number": 0} def load_data(self, data_path): diff --git a/llmebench/datasets/BibleMaghrebiDiacritization.py b/llmebench/datasets/BibleMaghrebiDiacritization.py index 99553d57..e9394ddd 100644 --- a/llmebench/datasets/BibleMaghrebiDiacritization.py +++ b/llmebench/datasets/BibleMaghrebiDiacritization.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class BibleMaghrebiDiacritizationDataset(DatasetBase): def __init__(self, **kwargs): super(BibleMaghrebiDiacritizationDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -22,9 +24,22 @@ def metadata(): isbn = {979-10-95546-25-2}, language = {english} }""", + "splits": { + "morrocan_f05": { + "test": "data/sequence_tagging_ner_pos_etc/diacritization/morrocan_f05.test.src-trg.txt", + "dev": 
"data/sequence_tagging_ner_pos_etc/diacritization/morrocan_f05.dev.src-trg.txt", + }, + "tunisian_f05": { + "test": "data/sequence_tagging_ner_pos_etc/diacritization/tunisian_f05.test.src-trg.txt", + "dev": "data/sequence_tagging_ner_pos_etc/diacritization/tunisian_f05.dev.src-trg.txt", + }, + "default": ["morrocan_f05", "tunisian_f05"], + }, + "task_type": TaskType.Other, } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return { "input": "Original sentence", "label": "Sentence with diacritized words", diff --git a/llmebench/datasets/COVID19Factuality.py b/llmebench/datasets/COVID19Factuality.py index a3e48a61..02985cc5 100644 --- a/llmebench/datasets/COVID19Factuality.py +++ b/llmebench/datasets/COVID19Factuality.py @@ -1,15 +1,18 @@ import pandas as pd from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class COVID19FactualityDataset(DatasetBase): def __init__(self, **kwargs): super(COVID19FactualityDataset, self).__init__(**kwargs) - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "some tweet", "label": "no"} + @staticmethod def metadata(): return { "language": "ar", @@ -41,6 +44,13 @@ def metadata(): doi = "10.18653/v1/2021.findings-emnlp.56", pages = "611--649", }""", + "license": "CC BY NC SA 4.0", + "splits": { + "test": "data/factuality_disinformation_harmful_content/factuality_covid19/covid19_infodemic_arabic_data_factuality_binary_test.tsv", + "train": "data/factuality_disinformation_harmful_content/factuality_covid19/covid19_infodemic_arabic_data_factuality_binary_train.tsv", + }, + "task_type": TaskType.Classification, + "class_labels": ["yes", "no"], } def load_data(self, data_path): diff --git a/llmebench/datasets/CT22Attentionworthy.py b/llmebench/datasets/CT22Attentionworthy.py index 8119b977..5dc88619 100644 --- a/llmebench/datasets/CT22Attentionworthy.py +++ b/llmebench/datasets/CT22Attentionworthy.py @@ -1,15 +1,18 @@ import pandas as pd from 
llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class CT22AttentionworthyDataset(DatasetBase): def __init__(self, **kwargs): super(CT22AttentionworthyDataset, self).__init__(**kwargs) - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "some tweet", "label": "no_not_interesting"} + @staticmethod def metadata(): return { "language": ["ar", "bg", "nl", "en", "tr"], @@ -21,6 +24,26 @@ def metadata(): series = {CLEF~'2022}, address = {Bologna, Italy}, }""", + "link": "https://gitlab.com/checkthat_lab/clef2022-checkthat-lab/clef2022-checkthat-lab", + "license": "Research Purpose Only", + "splits": { + "ar": { + "test": "data/factuality_disinformation_harmful_content/attentionworthy/CT22_arabic_1D_attentionworthy_test_gold.tsv", + "train": "data/factuality_disinformation_harmful_content/attentionworthy/CT22_arabic_1D_attentionworthy_train.tsv", + } + }, + "task_type": TaskType.Classification, + "class_labels": [ + "yes_discusses_action_taken", + "harmful", + "yes_discusses_cure", + "yes_asks_question", + "no_not_interesting", + "yes_other", + "yes_blame_authorities", + "yes_contains_advice", + "yes_calls_for_action", + ], } def load_data(self, data_path): diff --git a/llmebench/datasets/CT22Checkworthiness.py b/llmebench/datasets/CT22Checkworthiness.py index b7d9806c..7c855889 100644 --- a/llmebench/datasets/CT22Checkworthiness.py +++ b/llmebench/datasets/CT22Checkworthiness.py @@ -1,13 +1,15 @@ import pandas as pd from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class CT22CheckworthinessDataset(DatasetBase): def __init__(self, **kwargs): super(CT22CheckworthinessDataset, self).__init__(**kwargs) - def get_data_sample(self): + @staticmethod + def get_data_sample(): return { "input": "some tweet", "label": "1", @@ -15,6 +17,7 @@ def get_data_sample(self): "line_number": 0, } + @staticmethod def metadata(): return { "language": ["ar", "bg", "nl", "en", 
"es", "tr"], @@ -26,6 +29,36 @@ def metadata(): year={2022}, organization={Springer} }""", + "link": "https://gitlab.com/checkthat_lab/clef2022-checkthat-lab/clef2022-checkthat-lab", + "license": "Research Purpose Only", + "splits": { + "ar": { + "test": "data/factuality_disinformation_harmful_content/checkworthyness/arabic/CT22_arabic_1A_checkworthy_test_gold.tsv", + "train": "data/factuality_disinformation_harmful_content/checkworthyness/arabic/CT22_arabic_1A_checkworthy_train.tsv", + }, + "bg": { + "test": "data/factuality_disinformation_harmful_content/checkworthyness/bulgarian/CT22_bulgarian_1A_checkworthy_test_gold.tsv", + "train": "data/factuality_disinformation_harmful_content/checkworthyness/bulgarian/CT22_bulgarian_1A_checkworthy_train.tsv", + }, + "en": { + "test": "data/factuality_disinformation_harmful_content/checkworthyness/english/CT22_english_1A_checkworthy_test_gold.tsv", + "train": "data/factuality_disinformation_harmful_content/checkworthyness/english/CT22_english_1A_checkworthy_train.tsv", + }, + "es": { + "test": "data/factuality_disinformation_harmful_content/checkworthyness/spanish/CT22_spanish_1A_checkworthy_test_gold.tsv", + "train": "data/factuality_disinformation_harmful_content/checkworthyness/spanish/CT22_spanish_1A_checkworthy_train.tsv", + }, + "nl": { + "test": "data/factuality_disinformation_harmful_content/checkworthyness/dutch/CT22_dutch_1A_checkworthy_test_gold.tsv", + "train": "data/factuality_disinformation_harmful_content/checkworthyness/dutch/CT22_dutch_1A_checkworthy_train.tsv", + }, + "tr": { + "test": "data/factuality_disinformation_harmful_content/checkworthyness/turkish/CT22_turkish_1A_checkworthy_test_gold.tsv", + "train": "data/factuality_disinformation_harmful_content/checkworthyness/turkish/CT22_turkish_1A_checkworthy_train.tsv", + }, + }, + "task_type": TaskType.Classification, + "class_labels": ["0", "1"], } def load_data(self, data_path): diff --git a/llmebench/datasets/CT22Claim.py 
b/llmebench/datasets/CT22Claim.py index 83e8faf2..97c5a47d 100644 --- a/llmebench/datasets/CT22Claim.py +++ b/llmebench/datasets/CT22Claim.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class CT22ClaimDataset(DatasetBase): def __init__(self, **kwargs): super(CT22ClaimDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": ["ar", "bg", "nl", "en", "tr"], @@ -14,9 +16,20 @@ def metadata(): year={2022}, booktitle={Proceedings of the Working Notes of CLEF 2022 - Conference and Labs of the Evaluation Forum} }""", + "link": "https://gitlab.com/checkthat_lab/clef2022-checkthat-lab/clef2022-checkthat-lab", + "license": "Research Purpose Only", + "splits": { + "ar": { + "test": "data/factuality_disinformation_harmful_content/claim_covid19/CT22_arabic_1B_claim_test_gold.tsv", + "train": "data/factuality_disinformation_harmful_content/claim_covid19/CT22_arabic_1B_claim_train.tsv", + } + }, + "task_type": TaskType.Classification, + "class_labels": ["0", "1"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "Tweet", "label": "1"} def load_data(self, data_path): diff --git a/llmebench/datasets/CT22Harmful.py b/llmebench/datasets/CT22Harmful.py index c19049e5..955dc776 100644 --- a/llmebench/datasets/CT22Harmful.py +++ b/llmebench/datasets/CT22Harmful.py @@ -1,22 +1,35 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class CT22HarmfulDataset(DatasetBase): def __init__(self, **kwargs): super(CT22HarmfulDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": ["ar", "bg", "nl", "en", "tr"], "citation": """@inproceedings{nakov2022overview, - title={Overview of the CLEF-2022 CheckThat! 
lab task 1 on identifying relevant claims in tweets}, - author={Nakov, Preslav and Barr{\\'o}n-Cede{\\~n}o, Alberto and Da San Martino, Giovanni and Alam, Firoj and Kutlu, Mucahid and Zaghouani, Wajdi and Li, Chengkai and Shaar, Shaden and Mubarak, Hamdy and Nikolov, Alex}, - year={2022}, - booktitle={Proceedings of the Working Notes of CLEF 2022 - Conference and Labs of the Evaluation Forum} - }""", + title={Overview of the CLEF-2022 CheckThat! lab task 1 on identifying relevant claims in tweets}, + author={Nakov, Preslav and Barr{\\'o}n-Cede{\\~n}o, Alberto and Da San Martino, Giovanni and Alam, Firoj and Kutlu, Mucahid and Zaghouani, Wajdi and Li, Chengkai and Shaar, Shaden and Mubarak, Hamdy and Nikolov, Alex}, + year={2022}, + booktitle={Proceedings of the Working Notes of CLEF 2022 - Conference and Labs of the Evaluation Forum} + }""", + "link": "https://gitlab.com/checkthat_lab/clef2022-checkthat-lab/clef2022-checkthat-lab", + "license": "Research Purpose Only", + "splits": { + "ar": { + "test": "data/factuality_disinformation_harmful_content/harmful/CT22_arabic_1C_harmful_test_gold.tsv", + "train": "data/factuality_disinformation_harmful_content/harmful/CT22_arabic_1C_harmful_train.tsv", + }, + }, + "task_type": TaskType.Classification, + "class_labels": ["0", "1"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "Tweet", "label": "1"} def load_data(self, data_path): diff --git a/llmebench/datasets/CT23Subjectivity.py b/llmebench/datasets/CT23Subjectivity.py index b1ba7fe4..12d0d7a7 100644 --- a/llmebench/datasets/CT23Subjectivity.py +++ b/llmebench/datasets/CT23Subjectivity.py @@ -1,15 +1,18 @@ import pandas as pd from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class CT23SubjectivityDataset(DatasetBase): def __init__(self, **kwargs): super(CT23SubjectivityDataset, self).__init__(**kwargs) - def get_data_sample(self): + @staticmethod + def get_data_sample(): return 
{"input": "some tweet", "label": "SUBJ"} + @staticmethod def metadata(): return { "language": "ar", @@ -21,6 +24,16 @@ def metadata(): year={2023}, organization={Springer} }""", + "link": "https://gitlab.com/checkthat_lab/clef2023-checkthat-lab", + "license": "CC BY NC SA 4.0", + "splits": { + "ar": { + "dev": "data/factuality_disinformation_harmful_content/subjectivity/dev_ar.tsv", + "train": "data/factuality_disinformation_harmful_content/subjectivity/train_ar.tsv", + } + }, + "task_type": TaskType.Classification, + "class_labels": ["SUBJ", "OBJ"], } def load_data(self, data_path): diff --git a/llmebench/datasets/Emotion.py b/llmebench/datasets/Emotion.py index 095bbeb3..a9b68fac 100644 --- a/llmebench/datasets/Emotion.py +++ b/llmebench/datasets/Emotion.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class EmotionDataset(DatasetBase): def __init__(self, **kwargs): super(EmotionDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -22,13 +24,33 @@ def metadata(): pages = "6948--6958", abstract = "Emotion detection can provide us with a window into understanding human behavior. Due to the complex dynamics of human emotions, however, constructing annotated datasets to train automated models can be expensive. Thus, we explore the efficacy of cross-lingual approaches that would use data from a source language to build models for emotion detection in a target language. We compare three approaches, namely: i) using inherently multilingual models; ii) translating training data into the target language; and iii) using an automatically tagged parallel corpus. In our study, we consider English as the source language with Arabic and Spanish as target languages. We study the effectiveness of different classification models such as BERT and SVMs trained with different features. 
Our BERT-based monolingual models that are trained on target language data surpass state-of-the-art (SOTA) by 4{\\%} and 5{\\%} absolute Jaccard score for Arabic and Spanish respectively. Next, we show that using cross-lingual approaches with English data alone, we can achieve more than 90{\\%} and 80{\\%} relative effectiveness of the Arabic and Spanish BERT models respectively. Lastly, we use LIME to analyze the challenges of training cross-lingual models for different language pairs.", }""", + "link": "https://competitions.codalab.org/competitions/17751", + "license": "Restricted", + "download_url": "http://saifmohammad.com/WebDocs/AIT-2018/AIT2018-DATA/SemEval2018-Task1-all-data.zip", + "splits": { + "test": "data/sentiment_emotion_others/emotion/test-gold.txt", + "train": "data/sentiment_emotion_others/emotion/train.txt", + }, + "task_type": TaskType.MultiLabelClassification, + "class_labels": [ + "anger", + "disgust", + "fear", + "joy", + "love", + "optimism", + "pessimism", + "sadness", + "surprise", + "trust", + ], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "Tweet", "label": [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0]} def load_data(self, data_path, no_labels=False): - # TODO: modify to iterator data = [] with open(data_path, "r") as fp: for line_idx, line in enumerate(fp): diff --git a/llmebench/datasets/Location.py b/llmebench/datasets/Location.py index f280445e..d6ab5cf3 100644 --- a/llmebench/datasets/Location.py +++ b/llmebench/datasets/Location.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class LocationDataset(DatasetBase): def __init__(self, **kwargs): super(LocationDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -15,13 +17,43 @@ def metadata(): pages={145--153}, year={2021} }""", + "link": "https://alt.qcri.org/resources/UL2C-UserLocationsToCountries.tsv", + "splits": { + "test": 
"data/demographic_attributes/location/arab+others.txt", + "train": "data/demographic_attributes/location/dev.txt", + }, + "task_type": TaskType.Classification, + "class_labels": [ + "ae", + "OTHERS", + "bh", + "dz", + "eg", + "iq", + "jo", + "kw", + "lb", + "ly", + "ma", + "om", + "ps", + "qa", + "sa", + "sd", + "so", + "sy", + "tn", + "UNK", + "ye", + "mr", + ], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "Doha, Qatar", "label": "QA"} def load_data(self, data_path, no_labels=False): - # TODO: modify to iterator # Format: location \t country_code data = [] with open(data_path, "r") as fp: diff --git a/llmebench/datasets/MGBWords.py b/llmebench/datasets/MGBWords.py index a8294157..0a40f357 100644 --- a/llmebench/datasets/MGBWords.py +++ b/llmebench/datasets/MGBWords.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class MGBWordsDataset(DatasetBase): def __init__(self, **kwargs): super(MGBWordsDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -24,9 +26,27 @@ def metadata(): pages = "2274--2285", abstract = "We introduce the largest transcribed Arabic speech corpus, QASR, collected from the broadcast domain. This multi-dialect speech dataset contains 2,000 hours of speech sampled at 16kHz crawled from Aljazeera news channel. The dataset is released with lightly supervised transcriptions, aligned with the audio segments. Unlike previous datasets, QASR contains linguistically motivated segmentation, punctuation, speaker information among others. QASR is suitable for training and evaluating speech recognition systems, acoustics- and/or linguistics- based Arabic dialect identification, punctuation restoration, speaker identification, speaker linking, and potentially other NLP modules for spoken data. 
In addition to QASR transcription, we release a dataset of 130M words to aid in designing and training a better language model. We show that end-to-end automatic speech recognition trained on QASR reports a competitive word error rate compared to the previous MGB-2 corpus. We report baseline results for downstream natural language processing tasks such as named entity recognition using speech transcript. We also report the first baseline for Arabic punctuation restoration. We make the corpus available for the research community.", }""", + "link": "https://alt.qcri.org/resources/MGB-words.txt", + "license": "Research Purpose Only", + "splits": { + "test": "data/sequence_tagging_ner_pos_etc/NER/mgb/MGB-words.txt" + }, + "task_type": TaskType.SequenceLabeling, + "class_labels": [ + "B-PERS", + "I-PERS", + "B-LOC", + "I-LOC", + "B-ORG", + "I-ORG", + "B-MISC", + "I-MISC", + "O", + ], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "sentence", "label": "named entity labels are here"} def load_data(self, data_path, no_labels=False): diff --git a/llmebench/datasets/MLQA.py b/llmebench/datasets/MLQA.py index 4aa618b7..d90eb298 100644 --- a/llmebench/datasets/MLQA.py +++ b/llmebench/datasets/MLQA.py @@ -1,12 +1,12 @@ -import json - from llmebench.datasets.SQuADBase import SQuADBase +from llmebench.tasks import TaskType class MLQADataset(SQuADBase): def __init__(self, **kwargs): super(MLQADataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -27,4 +27,11 @@ def metadata(): pages = "7315--7330", abstract = "Question answering (QA) models have shown rapid progress enabled by the availability of large, high-quality benchmark datasets. Such annotated datasets are difficult and costly to collect, and rarely exist in languages other than English, making building QA systems that work well in other languages challenging. 
In order to develop such systems, it is crucial to invest in high quality multilingual evaluation benchmarks to measure progress. We present MLQA, a multi-way aligned extractive QA evaluation benchmark intended to spur research in this area. MLQA contains QA instances in 7 languages, English, Arabic, German, Spanish, Hindi, Vietnamese and Simplified Chinese. MLQA has over 12K instances in English and 5K in each other language, with each instance parallel between 4 languages on average. We evaluate state-of-the-art cross-lingual models and machine-translation-based baselines on MLQA. In all cases, transfer results are shown to be significantly behind training-language performance.", }""", + "link": "https://github.com/facebookresearch/mlqa", + "license": "CC BY-NC 4.0", + "splits": { + "dev": "data/QA/MLQA/dev/dev-context-ar-question-ar.json", + "test": "data/QA/MLQA/test/test-context-ar-question-ar.json", + }, + "task_type": TaskType.QuestionAnswering, } diff --git a/llmebench/datasets/NameInfo.py b/llmebench/datasets/NameInfo.py index 02735d3b..b876fe73 100644 --- a/llmebench/datasets/NameInfo.py +++ b/llmebench/datasets/NameInfo.py @@ -1,21 +1,130 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class NameInfoDataset(DatasetBase): def __init__(self, **kwargs): super(NameInfoDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", "citation": """@inproceedings{Under review...}""", + "splits": { + "test": "data/demographic_attributes/name_info/wikidata_test.txt", + "train": "data/demographic_attributes/name_info/dev.txt", + }, + "task_type": TaskType.Classification, + "class_labels": [ + "gb", + "us", + "cl", + "fr", + "ru", + "pl", + "in", + "it", + "kr", + "gh", + "ca", + "sa", + "at", + "de", + "cn", + "br", + "dk", + "se", + "bd", + "cu", + "jp", + "be", + "es", + "co", + "id", + "iq", + "pk", + "tr", + "il", + "ch", + "ar", + "ro", + "nl", + "ps", + "ug", + "ir", + "cg", + 
"do", + "ee", + "tn", + "gr", + "np", + "ie", + "sy", + "hu", + "eg", + "ma", + "ve", + "ph", + "no", + "bg", + "si", + "ke", + "au", + "et", + "py", + "af", + "pt", + "th", + "bo", + "mx", + "lb", + "za", + "fi", + "hr", + "vn", + "ly", + "nz", + "qa", + "kh", + "ci", + "ng", + "sg", + "cm", + "dz", + "tz", + "ae", + "pe", + "az", + "lu", + "ec", + "cz", + "ua", + "uy", + "sd", + "ao", + "my", + "lv", + "kw", + "tw", + "bh", + "lk", + "ye", + "cr", + "jo", + "pa", + "om", + "uz", + "by", + "kz", + ], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "جورج واشنطن", "label": "GB"} def load_data(self, data_path, no_labels=False): - # TODO: modify to iterator # Format: # جورج واشنطن United Kingdom GB data = [] diff --git a/llmebench/datasets/OSACT4SubtaskA.py b/llmebench/datasets/OSACT4SubtaskA.py index 45ab5c13..852ab9d0 100644 --- a/llmebench/datasets/OSACT4SubtaskA.py +++ b/llmebench/datasets/OSACT4SubtaskA.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class OSACT4SubtaskADataset(DatasetBase): def __init__(self, **kwargs): super(OSACT4SubtaskADataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -31,13 +33,21 @@ def metadata(): abstract = "We present the results and the main findings of SemEval-2020 Task 12 on Multilingual Offensive Language Identification in Social Media (OffensEval-2020). The task included three subtasks corresponding to the hierarchical taxonomy of the OLID schema from OffensEval-2019, and it was offered in five languages: Arabic, Danish, English, Greek, and Turkish. 
OffensEval-2020 was one of the most popular tasks at SemEval-2020, attracting a large number of participants across all subtasks and languages: a total of 528 teams signed up to participate in the task, 145 teams submitted official runs on the test data, and 70 teams submitted system description papers.", } """, + "link": "https://edinburghnlp.inf.ed.ac.uk/workshops/OSACT4/", + "license": "CC BY 4.0", + "splits": { + "test": "data/factuality_disinformation_harmful_content/offensive_language/OSACT2020-sharedTask-test-tweets-labels.txt", + "train": "data/factuality_disinformation_harmful_content/offensive_language/OSACT2020-sharedTask-train_OFF.txt", + }, + "task_type": TaskType.Classification, + "class_labels": ["OFF", "NOT_OFF"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "@USER يلا يا خوخة يا مهزئة ع دراستك", "label": "OFF"} def load_data(self, data_path, no_labels=False): - # TODO: modify to iterator # Format: text \t offensive_label data = [] with open(data_path, "r") as fp: diff --git a/llmebench/datasets/OSACT4SubtaskB.py b/llmebench/datasets/OSACT4SubtaskB.py index 784e6a95..a79c0ee7 100644 --- a/llmebench/datasets/OSACT4SubtaskB.py +++ b/llmebench/datasets/OSACT4SubtaskB.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class OSACT4SubtaskBDataset(DatasetBase): def __init__(self, **kwargs): super(OSACT4SubtaskBDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -15,13 +17,21 @@ def metadata(): pages={48--52}, year={2020} }""", + "link": "https://edinburghnlp.inf.ed.ac.uk/workshops/OSACT4/", + "license": "CC BY 4.0", + "splits": { + "test": "data/factuality_disinformation_harmful_content/hate_speech/OSACT2020-sharedTask-test-tweets-labels.txt", + "train": "data/factuality_disinformation_harmful_content/hate_speech/OSACT2020-sharedTask-train_HS.txt", + }, + "task_type": TaskType.Classification, + 
"class_labels": ["HS", "NOT_HS"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "ايه اللي انت بتقوله ده يا اوروبي يا متخلف", "label": "HS"} def load_data(self, data_path, no_labels=False): - # TODO: modify to iterator # Format: text \t hatespeech_label data = [] with open(data_path, "r") as fp: diff --git a/llmebench/datasets/PADT.py b/llmebench/datasets/PADT.py index baf7ee36..5aea8424 100644 --- a/llmebench/datasets/PADT.py +++ b/llmebench/datasets/PADT.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class PADTDataset(DatasetBase): def __init__(self, **kwargs): super(PADTDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -34,9 +36,16 @@ def metadata(): publisher={2004a} } """, + "link": "https://ufal.mff.cuni.cz/padt/PADT_1.0/docs/index.html", + "splits": { + "test": "data/sequence_tagging_ner_pos_etc/parsing/arabic_PADT_test_gs.conll", + "train": "data/sequence_tagging_ner_pos_etc/parsing/arabic_PADT_train.conll", + }, + "task_type": TaskType.Other, } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return { "input": "Original sentence", "label": { diff --git a/llmebench/datasets/QADI.py b/llmebench/datasets/QADI.py index f436b494..711d08e1 100644 --- a/llmebench/datasets/QADI.py +++ b/llmebench/datasets/QADI.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class QADIDataset(DatasetBase): def __init__(self, **kwargs): super(QADIDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -15,13 +17,40 @@ def metadata(): pages={1--10}, year={2021} }""", + "link": "https://alt.qcri.org/resources/qadi/", + "license": "Apache License, Version 2.0", + "splits": { + "test": "data/sequence_tagging_ner_pos_etc/dialect_identification/QADI_test-PalestinePS-corrected.txt" + }, + "task_type": 
TaskType.Classification, + "class_labels": [ + "EG", + "DZ", + "SD", + "YE", + "SY", + "TN", + "AE", + "JO", + "LY", + "PS", + "OM", + "LB", + "KW", + "QA", + "BH", + "MSA", + "SA", + "IQ", + "MA", + ], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "طب ماتمشي هو حد ماسك فيك", "label": "EG"} def load_data(self, data_path, no_labels=False): - # TODO: modify to iterator # Format: dialect_id_label \t text data = [] with open(data_path, "r") as fp: diff --git a/llmebench/datasets/QCRIDialectalArabicPOS.py b/llmebench/datasets/QCRIDialectalArabicPOS.py index 573a3a8d..8146fa26 100644 --- a/llmebench/datasets/QCRIDialectalArabicPOS.py +++ b/llmebench/datasets/QCRIDialectalArabicPOS.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class QCRIDialectalArabicPOSDataset(DatasetBase): def __init__(self, **kwargs): super(QCRIDialectalArabicPOSDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -22,9 +24,55 @@ def metadata(): isbn = {979-10-95546-00-9}, language = {english} }""", + "link": "https://alt.qcri.org/resources/da_resources/", + "license": "Apache License, Version 2.0", + "splits": { + "glf.data_5": { + "dev": "data/sequence_tagging_ner_pos_etc/POS/glf.pos/glf.data_5.dev.src-trg.sent", + "test": "data/sequence_tagging_ner_pos_etc/POS/glf.pos/glf.data_5.test.src-trg.sent", + }, + "lev.data_5": { + "dev": "data/sequence_tagging_ner_pos_etc/POS/lev.pos/lev.data_5.dev.src-trg.sent", + "test": "data/sequence_tagging_ner_pos_etc/POS/lev.pos/lev.data_5.test.src-trg.sent", + }, + "egy.data_5": { + "dev": "data/sequence_tagging_ner_pos_etc/POS/egy.pos/egy.data_5.dev.src-trg.sent", + "test": "data/sequence_tagging_ner_pos_etc/POS/egy.pos/egy.data_5.test.src-trg.sent", + }, + "mgr.data_5": { + "dev": "data/sequence_tagging_ner_pos_etc/POS/mgr.pos/mgr.data_5.dev.src-trg.sent", + "test": 
"data/sequence_tagging_ner_pos_etc/POS/mgr.pos/mgr.data_5.test.src-trg.sent", + }, + "default": ["glf.data_5", "lev.data_5", "egy.data_5", "mgr.data_5"], + }, + "task_type": TaskType.SequenceLabeling, + "class_labels": [ + "ADJ", + "ADV", + "CASE", + "CONJ", + "DET", + "EMOT", + "FOREIGN", + "FUT_PART", + "HASH", + "MENTION", + "NEG_PART", + "NOUN", + "NSUFF", + "NUM", + "PART", + "PREP", + "PROG_PART", + "PRON", + "PUNC", + "URL", + "V", + ], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return { "input": "Original sentence", "label": "Sentence with POS tags", diff --git a/llmebench/datasets/QCRIDialectalArabicSegmentation.py b/llmebench/datasets/QCRIDialectalArabicSegmentation.py index 9293c4d7..4192edc4 100644 --- a/llmebench/datasets/QCRIDialectalArabicSegmentation.py +++ b/llmebench/datasets/QCRIDialectalArabicSegmentation.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class QCRIDialectalArabicSegmentationDataset(DatasetBase): def __init__(self, **kwargs): super(QCRIDialectalArabicSegmentationDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -26,16 +28,38 @@ def metadata(): doi = "10.18653/v1/K17-1043", pages = "432--441" }""", + "link": "https://alt.qcri.org/resources/da_resources/", + "license": "Apache License, Version 2.0", + "splits": { + "glf.data_5": { + "dev": "data/sequence_tagging_ner_pos_etc/segmentation/glf.seg/glf.data_5.dev.src.sent", + "test": "data/sequence_tagging_ner_pos_etc/segmentation/glf.seg/glf.data_5.test.src.sent", + }, + "lev.data_5": { + "dev": "data/sequence_tagging_ner_pos_etc/segmentation/lev.seg/lev.data_5.dev.src.sent", + "test": "data/sequence_tagging_ner_pos_etc/segmentation/lev.seg/lev.data_5.test.src.sent", + }, + "egy.data_5": { + "dev": "data/sequence_tagging_ner_pos_etc/segmentation/egy.seg/egy.data_5.dev.src.sent", + "test": 
"data/sequence_tagging_ner_pos_etc/segmentation/egy.seg/egy.data_5.test.src.sent", + }, + "mgr.data_5": { + "dev": "data/sequence_tagging_ner_pos_etc/segmentation/mgr.seg/mgr.data_5.dev.src.sent", + "test": "data/sequence_tagging_ner_pos_etc/segmentation/mgr.seg/mgr.data_5.test.src.sent", + }, + "default": ["glf.data_5", "lev.data_5", "egy.data_5", "mgr.data_5"], + }, + "task_type": TaskType.Other, } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return { "input": "Original sentence", "label": "Sentence with segmented words", } def load_data(self, data_path, no_labels=False): - # TODO: modify to iterator data = [] with open(data_path, "r") as fp: diff --git a/llmebench/datasets/NewsCatAlKhaleej.py b/llmebench/datasets/SANADAkhbarona.py similarity index 57% rename from llmebench/datasets/NewsCatAlKhaleej.py rename to llmebench/datasets/SANADAkhbarona.py index f016952b..7ca79c6e 100644 --- a/llmebench/datasets/NewsCatAlKhaleej.py +++ b/llmebench/datasets/SANADAkhbarona.py @@ -1,15 +1,18 @@ import pandas as pd from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType -class NewsCatAlKhaleejDataset(DatasetBase): +class SANADAkhbaronaDataset(DatasetBase): def __init__(self, **kwargs): - super(NewsCatAlKhaleejDataset, self).__init__(**kwargs) + super(SANADAkhbaronaDataset, self).__init__(**kwargs) - def get_data_sample(self): - return {"input": "some tweet", "label": "checkworthy"} + @staticmethod + def get_data_sample(): + return {"input": "some tweet", "label": "tech"} + @staticmethod def metadata(): return { "language": "ar", @@ -22,6 +25,22 @@ def metadata(): year={2019}, publisher={Elsevier} }""", + "link": "https://data.mendeley.com/datasets/57zpx667y9/2", + "license": "CC BY 4.0", + "splits": { + "test": "data/news_categorization/SANAD_akhbarona_news_cat_test.tsv", + "train": "data/news_categorization/SANAD_akhbarona_news_cat_train.tsv", + }, + "task_type": TaskType.Classification, + "class_labels": [ + 
"politics", + "religion", + "medical", + "sports", + "tech", + "finance", + "culture", + ], } def load_data(self, data_path): diff --git a/llmebench/datasets/NewsCatAlArabiya.py b/llmebench/datasets/SANADAlArabiya.py similarity index 58% rename from llmebench/datasets/NewsCatAlArabiya.py rename to llmebench/datasets/SANADAlArabiya.py index 050048c7..badb34f2 100644 --- a/llmebench/datasets/NewsCatAlArabiya.py +++ b/llmebench/datasets/SANADAlArabiya.py @@ -1,15 +1,18 @@ import pandas as pd from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType -class NewsCatAlArabiyaDataset(DatasetBase): +class SANADAlArabiyaDataset(DatasetBase): def __init__(self, **kwargs): - super(NewsCatAlArabiyaDataset, self).__init__(**kwargs) + super(SANADAlArabiyaDataset, self).__init__(**kwargs) - def get_data_sample(self): - return {"input": "some tweet", "label": "checkworthy"} + @staticmethod + def get_data_sample(): + return {"input": "some tweet", "label": "tech"} + @staticmethod def metadata(): return { "language": "ar", @@ -22,6 +25,21 @@ def metadata(): year={2019}, publisher={Elsevier} }""", + "link": "https://data.mendeley.com/datasets/57zpx667y9/2", + "license": "CC BY 4.0", + "splits": { + "test": "data/news_categorization/SANAD_alarabiya_news_cat_test.tsv", + "train": "data/news_categorization/SANAD_alarabiya_news_cat_train.tsv", + }, + "task_type": TaskType.Classification, + "class_labels": [ + "politics", + "medical", + "sports", + "tech", + "finance", + "culture", + ], } def load_data(self, data_path): diff --git a/llmebench/datasets/NewsCatAkhbarona.py b/llmebench/datasets/SANADAlKhaleej.py similarity index 57% rename from llmebench/datasets/NewsCatAkhbarona.py rename to llmebench/datasets/SANADAlKhaleej.py index 30ea3fd7..e16e1c75 100644 --- a/llmebench/datasets/NewsCatAkhbarona.py +++ b/llmebench/datasets/SANADAlKhaleej.py @@ -1,15 +1,18 @@ import pandas as pd from llmebench.datasets.dataset_base import DatasetBase +from 
llmebench.tasks import TaskType -class NewsCatAkhbaronaDataset(DatasetBase): +class SANADAlKhaleejDataset(DatasetBase): def __init__(self, **kwargs): - super(NewsCatAkhbaronaDataset, self).__init__(**kwargs) + super(SANADAlKhaleejDataset, self).__init__(**kwargs) - def get_data_sample(self): - return {"input": "some tweet", "label": "checkworthy"} + @staticmethod + def get_data_sample(): + return {"input": "some tweet", "label": "tech"} + @staticmethod def metadata(): return { "language": "ar", @@ -22,6 +25,22 @@ def metadata(): year={2019}, publisher={Elsevier} }""", + "link": "https://data.mendeley.com/datasets/57zpx667y9/2", + "license": "CC BY 4.0", + "splits": { + "test": "data/news_categorization/SANAD_alkhaleej_news_cat_test.tsv", + "train": "data/news_categorization/SANAD_alkhaleej_news_cat_train.tsv", + }, + "task_type": TaskType.Classification, + "class_labels": [ + "politics", + "religion", + "medical", + "sports", + "tech", + "finance", + "culture", + ], } def load_data(self, data_path): diff --git a/llmebench/datasets/SQuADBase.py b/llmebench/datasets/SQuADBase.py index 5ff00d06..8cf9a7c0 100644 --- a/llmebench/datasets/SQuADBase.py +++ b/llmebench/datasets/SQuADBase.py @@ -7,7 +7,8 @@ class SQuADBase(DatasetBase): def __init__(self, **kwargs): super(SQuADBase, self).__init__(**kwargs) - def get_data_sample(self): + @staticmethod + def get_data_sample(): return { "input": { "context": "context for the questions. 
Usually a snippet of a wikipedia article", diff --git a/llmebench/datasets/STSQ2Q.py b/llmebench/datasets/STSQ2Q.py index 7451f442..080b6c69 100644 --- a/llmebench/datasets/STSQ2Q.py +++ b/llmebench/datasets/STSQ2Q.py @@ -1,12 +1,14 @@ import pandas as pd from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class Q2QSimDataset(DatasetBase): def __init__(self, **kwargs): super(Q2QSimDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -17,9 +19,17 @@ def metadata(): pages={1--8}, year={2019} }""", + "link": "http://nsurl.org/2019-2/tasks/task8-semantic-question-similarity-in-arabic/", + "splits": { + "test": "data/STS/nsurl-2019-task8/test.tsv", + "train": "data/STS/nsurl-2019-task8/train.tsv", + }, + "task_type": TaskType.Classification, + "class_labels": ["0", "1"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return { "input": "السوال الاول السوال الثاني\tالسوال الاول السوال الثاني", "label": "1", diff --git a/llmebench/datasets/SemEval17T1STS.py b/llmebench/datasets/SemEval17T1STS.py index e5412f4f..65e07e24 100644 --- a/llmebench/datasets/SemEval17T1STS.py +++ b/llmebench/datasets/SemEval17T1STS.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class SemEval17T1STSDataset(DatasetBase): def __init__(self, **kwargs): super(SemEval17T1STSDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -15,9 +17,20 @@ def metadata(): pages={1--14}, year={2017} }""", + "link": "https://alt.qcri.org/semeval2017/task1/index.php", + "splits": { + "test": { + "sentences_path": "data/STS/semeval-2017/STS2017.eval.v1.1/STS.input.track1.ar-ar.txt", + "gt_data_path": "data/STS/semeval-2017/STS2017.gs/STS.gs.track1.ar-ar.txt", + }, + "train": "data/STS/semeval-2017/ar_sts_data_updated/Ar_STS/ar.STS.All.txt", + }, + "task_type": TaskType.Regression, + "score_range": (0, 
5), } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "الجملة بالعربية\tالجملة بالعربية", "label": 1.2} def load_train_data(self, data_path): diff --git a/llmebench/datasets/SemEval17T2STS.py b/llmebench/datasets/SemEval17T2STS.py index 8b916880..241ab0e3 100644 --- a/llmebench/datasets/SemEval17T2STS.py +++ b/llmebench/datasets/SemEval17T2STS.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class SemEval17T2STSDataset(DatasetBase): def __init__(self, **kwargs): super(SemEval17T2STSDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -15,9 +17,20 @@ def metadata(): pages={1--14}, year={2017} }""", + "link": "https://alt.qcri.org/semeval2017/task2/index.php", + "splits": { + "test": { + "sentences_path": "data/STS/semeval-2017/STS2017.eval.v1.1/STS.input.track2.ar-en.txt", + "gt_data_path": "data/STS/semeval-2017/STS2017.gs/STS.gs.track2.ar-en.txt", + }, + "train": "data/STS/semeval-2017/ar_sts_data_updated/En_Ar_STS/en_ar.STS.All.txt", + }, + "task_type": TaskType.Regression, + "score_range": (0, 5), } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "الجملة بالعربية\tالجملة english", "label": 1.2} def load_train_data(self, data_path): diff --git a/llmebench/datasets/SemEval23T3Propaganda.py b/llmebench/datasets/SemEval23T3Propaganda.py index 5cbcb671..e1d17a5b 100644 --- a/llmebench/datasets/SemEval23T3Propaganda.py +++ b/llmebench/datasets/SemEval23T3Propaganda.py @@ -2,6 +2,7 @@ from pathlib import Path from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class SemEval23T3PropagandaDataset(DatasetBase): @@ -10,6 +11,7 @@ def __init__(self, techniques_path=None, **kwargs): self.techniques_path = Path(techniques_path) if techniques_path else None super(SemEval23T3PropagandaDataset, self).__init__(**kwargs) + @staticmethod def metadata(): 
return { "language": ["en", "es", "fr", "de", "el", "it", "ka", "pl", "ru"], @@ -29,9 +31,64 @@ def metadata(): pages = "2343--2361", abstract = "We describe SemEval-2023 task 3 on Detecting the Category, the Framing, and the Persuasion Techniques in Online News in a Multilingual Setup: the dataset, the task organization process, the evaluation setup, the results, and the participating systems. The task focused on news articles in nine languages (six known to the participants upfront: English, French, German, Italian, Polish, and Russian), and three additional ones revealed to the participants at the testing phase: Spanish, Greek, and Georgian). The task featured three subtasks: (1) determining the genre of the article (opinion, reporting, or satire), (2) identifying one or more frames used in an article from a pool of 14 generic frames, and (3) identify the persuasion techniques used in each paragraph of the article, using a taxonomy of 23 persuasion techniques. This was a very popular task: a total of 181 teams registered to participate, and 41 eventually made an official submission on the test set.", }""", + "link": "https://propaganda.math.unipd.it/semeval2023task3/", + "splits": { + "de": { + "dev": "data/factuality_disinformation_harmful_content/propaganda_semeval23/ge_dev_subtask3.json", + "train": "data/factuality_disinformation_harmful_content/propaganda_semeval23/ge_train_subtask3.json", + }, + "en": { + "dev": "data/factuality_disinformation_harmful_content/propaganda_semeval23/en_dev_subtask3.json", + "train": "data/factuality_disinformation_harmful_content/propaganda_semeval23/en_train_subtask3.json", + }, + "fr": { + "dev": "data/factuality_disinformation_harmful_content/propaganda_semeval23/fr_dev_subtask3.json", + "train": "data/factuality_disinformation_harmful_content/propaganda_semeval23/fr_train_subtask3.json", + }, + "it": { + "dev": "data/factuality_disinformation_harmful_content/propaganda_semeval23/it_dev_subtask3.json", + "train": 
"data/factuality_disinformation_harmful_content/propaganda_semeval23/it_train_subtask3.json", + }, + "pl": { + "dev": "data/factuality_disinformation_harmful_content/propaganda_semeval23/po_dev_subtask3.json", + "train": "data/factuality_disinformation_harmful_content/propaganda_semeval23/po_train_subtask3.json", + }, + "ru": { + "dev": "data/factuality_disinformation_harmful_content/propaganda_semeval23/ru_dev_subtask3.json", + "train": "data/factuality_disinformation_harmful_content/propaganda_semeval23/ru_train_subtask3.json", + }, + }, + "task_type": TaskType.MultiLabelClassification, + "class_labels": [ + "Appeal_to_Authority", + "Appeal_to_Fear-Prejudice", + "Appeal_to_Hypocrisy", + "Appeal_to_Popularity", + "Appeal_to_Time", + "Appeal_to_Values", + "Causal_Oversimplification", + "Consequential_Oversimplification", + "Conversation_Killer", + "Doubt", + "Exaggeration-Minimisation", + "False_Dilemma-No_Choice", + "Flag_Waving", + "Guilt_by_Association", + "Loaded_Language", + "Name_Calling-Labeling", + "Obfuscation-Vagueness-Confusion", + "Questioning_the_Reputation", + "Red_Herring", + "Repetition", + "Slogans", + "Straw_Man", + "Whataboutism", + "no_technique", + ], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "text", "label": ["no_technique"], "line_number": 0} def get_predefined_techniques(self): diff --git a/llmebench/datasets/Spam.py b/llmebench/datasets/Spam.py index 6e5147ea..5f2bf68c 100644 --- a/llmebench/datasets/Spam.py +++ b/llmebench/datasets/Spam.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class SpamDataset(DatasetBase): def __init__(self, **kwargs): super(SpamDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -16,13 +18,18 @@ def metadata(): year={2020}, organization={Springer} }""", + "link": "https://alt.qcri.org/resources/SpamArabicTwitter.tgz", + "license": "Research Purpose Only", + 
"splits": {"test": "data/sentiment_emotion_others/spam/ArabicAds-test.txt"}, + "task_type": TaskType.Classification, + "class_labels": ["__label__ADS", "__label__NOTADS"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "أختر قلباً وليسّ شكلاً..", "label": "__label__NOTADS"} def load_data(self, data_path, no_labels=False): - # TODO: modify to iterator # Format: spam_label \t text data = [] with open(data_path, "r") as fp: diff --git a/llmebench/datasets/TyDiQA.py b/llmebench/datasets/TyDiQA.py index 711196da..2fa81ae6 100644 --- a/llmebench/datasets/TyDiQA.py +++ b/llmebench/datasets/TyDiQA.py @@ -1,12 +1,14 @@ import json from llmebench.datasets.SQuADBase import SQuADBase +from llmebench.tasks import TaskType class TyDiQADataset(SQuADBase): def __init__(self, **kwargs): super(TyDiQADataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -16,4 +18,11 @@ def metadata(): year = {2020}, journal = {Transactions of the Association for Computational Linguistics} }""", + "link": "https://github.com/google-research-datasets/tydiqa", + "license": "Apache License Version 2.0", + "splits": { + "dev": "data/QA/tydiqa/tydiqa-goldp-dev-arabic.json", + "train": "data/QA/ARCD/arcd-train.json", + }, + "task_type": TaskType.QuestionAnswering, } diff --git a/llmebench/datasets/UnifiedFCFactuality.py b/llmebench/datasets/UnifiedFCFactuality.py index 672c4907..81ff1a6a 100644 --- a/llmebench/datasets/UnifiedFCFactuality.py +++ b/llmebench/datasets/UnifiedFCFactuality.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class UnifiedFCFactualityDataset(DatasetBase): def __init__(self, **kwargs): super(UnifiedFCFactualityDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -19,9 +21,18 @@ def metadata(): booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for 
Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)", year = "2018", }""", + "link": "https://alt.qcri.org/resources/arabic-fact-checking-and-stance-detection-corpus/", + "license": "Research Purpose Only", + "splits": { + "test": "data/factuality_disinformation_harmful_content/factuality_stance_ramy/ramy_arabic_fact_checking.tsv", + "train": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/claim/train.csv", + }, + "task_type": TaskType.Classification, + "class_labels": ["true", "false"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "الجملة الاولى", "label": "agree", "input_id": "id"} def load_data(self, data_path): diff --git a/llmebench/datasets/UnifiedFCStance.py b/llmebench/datasets/UnifiedFCStance.py index 0e478261..549852d8 100644 --- a/llmebench/datasets/UnifiedFCStance.py +++ b/llmebench/datasets/UnifiedFCStance.py @@ -1,12 +1,14 @@ import json from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class UnifiedFCStanceDataset(DatasetBase): def __init__(self, **kwargs): super(UnifiedFCStanceDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -21,9 +23,18 @@ def metadata(): booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)", year = "2018", }""", + "link": "https://alt.qcri.org/resources/arabic-fact-checking-and-stance-detection-corpus/", + "license": "Research Purpose Only", + "splits": { + "test": "data/factuality_disinformation_harmful_content/factuality_stance_ramy/ramy_arabic_stance.jsonl", + "train": "data/factuality_disinformation_harmful_content/factuality_stance_khouja/stance/train.csv", + }, + "task_type": TaskType.Classification, + "class_labels": ["agree", "disagree", "discuss", "unrelated"], } - def get_data_sample(self): + @staticmethod + 
def get_data_sample(): return { "input": { # Train samples diff --git a/llmebench/datasets/WANLP22Propaganda.py b/llmebench/datasets/WANLP22Propaganda.py index 31cb24ac..2e65ea83 100644 --- a/llmebench/datasets/WANLP22Propaganda.py +++ b/llmebench/datasets/WANLP22Propaganda.py @@ -3,6 +3,7 @@ from pathlib import Path from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class WANLP22PropagandaDataset(DatasetBase): @@ -11,6 +12,7 @@ def __init__(self, techniques_path=None, **kwargs): self.techniques_path = Path(techniques_path) if techniques_path else None super(WANLP22PropagandaDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -22,9 +24,38 @@ def metadata(): year={2022}, organization={Association for Computational Linguistics} }""", + "link": "https://gitlab.com/araieval/propaganda-detection", + "license": "Research Purpose Only", + "splits": { + "test": "data/factuality_disinformation_harmful_content/propaganda/task1_test_gold_label_final.json", + "train": "data/factuality_disinformation_harmful_content/propaganda/task1_train.json", + }, + "task_type": TaskType.MultiLabelClassification, + "class_labels": [ + "no technique", + "Smears", + "Exaggeration/Minimisation", + "Loaded Language", + "Appeal to fear/prejudice", + "Name calling/Labeling", + "Slogans", + "Repetition", + "Doubt", + "Obfuscation, Intentional vagueness, Confusion", + "Flag-waving", + "Glittering generalities (Virtue)", + "Misrepresentation of Someone's Position (Straw Man)", + "Presenting Irrelevant Data (Red Herring)", + "Appeal to authority", + "Whataboutism", + "Black-and-white Fallacy/Dictatorship", + "Thought-terminating cliché", + "Causal Oversimplification", + ], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "Tweet", "label": ["no technique"]} def get_predefined_techniques(self): diff --git a/llmebench/datasets/WikiNewsDiacritization.py 
b/llmebench/datasets/WikiNewsDiacritization.py index 04bf7801..79b408ff 100644 --- a/llmebench/datasets/WikiNewsDiacritization.py +++ b/llmebench/datasets/WikiNewsDiacritization.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class WikiNewsDiacritizationDataset(DatasetBase): def __init__(self, **kwargs): super(WikiNewsDiacritizationDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -22,9 +24,17 @@ def metadata(): doi = "10.18653/v1/W17-1302", pages = "9--17", }""", + "link": "https://github.com/kdarwish/Farasa/tree/master", + "license": "Research Purpose Only", + "splits": { + "test": "data/sequence_tagging_ner_pos_etc/diacritization/WikiNewsTruth.txt", + "train": "data/sequence_tagging_ner_pos_etc/diacritization/WikiNewsTruthDev.txt", + }, + "task_type": TaskType.Other, } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return { "input": "Original sentence", "label": "Sentence with diacritized words", diff --git a/llmebench/datasets/WikiNewsLemmatization.py b/llmebench/datasets/WikiNewsLemmatization.py index 94a9ba0d..916ebdb3 100644 --- a/llmebench/datasets/WikiNewsLemmatization.py +++ b/llmebench/datasets/WikiNewsLemmatization.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class WikiNewsLemmatizationDataset(DatasetBase): def __init__(self, **kwargs): super(WikiNewsLemmatizationDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -18,9 +20,16 @@ def metadata(): publisher = "European Language Resources Association (ELRA)", url = "https://aclanthology.org/L18-1181", }""", + "link": "http://alt.qcri.org/~hmubarak/WikiNews-26-06-2015-RefLemma.xlsx", + "license": "Research Purpose Only", + "splits": { + "test": "data/sequence_tagging_ner_pos_etc/lemmatization/WikiNews-26-06-2015-RefLemma.txt" + }, + "task_type": TaskType.Other, 
} - def get_data_sample(self): + @staticmethod + def get_data_sample(): return { "input": "جوائز", "label": ("جوائز", "جائزة"), diff --git a/llmebench/datasets/WikiNewsPOS.py b/llmebench/datasets/WikiNewsPOS.py index c4bc3b94..dec64539 100644 --- a/llmebench/datasets/WikiNewsPOS.py +++ b/llmebench/datasets/WikiNewsPOS.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class WikiNewsPOSDataset(DatasetBase): def __init__(self, **kwargs): super(WikiNewsPOSDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -15,9 +17,46 @@ def metadata(): pages={130--137}, year={2017} }""", + "link": "https://github.com/kdarwish/Farasa/blob/master/WikiNews.pos.ref", + "license": "Research Purpose Only", + "splits": { + "test": "data/sequence_tagging_ner_pos_etc/POS/WikiNewsTruth.txt.POS.tab", + "train": "data/sequence_tagging_ner_pos_etc/POS/WikiNewsTruthDev.txt", + }, + "task_type": TaskType.SequenceLabeling, + "class_labels": [ + "ABBREV", + "ADJ", + "ADJ/CONJ", + "ADJ/DET", + "ADJ/NUM", + "ADV", + "CASE", + "CONJ", + "DET", + "FOREIGN", + "FUT_PART", + "NOUN", + "NOUN/DET", + "NSUFF", + "NSUFF/ADJ", + "NSUFF/DET", + "NSUFF/NOUN", + "NUM", + "PART", + "PART/CONJ", + "PART/NOUN", + "PART/PART", + "PART/PREP", + "PREP", + "PRON", + "PUNC", + "V", + ], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return { "input": "Original sentence", "label": "Sentence with POS tags", diff --git a/llmebench/datasets/WikiNewsSegmentation.py b/llmebench/datasets/WikiNewsSegmentation.py index 639615d4..1c6fa97e 100644 --- a/llmebench/datasets/WikiNewsSegmentation.py +++ b/llmebench/datasets/WikiNewsSegmentation.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class WikiNewsSegmentationDataset(DatasetBase): def __init__(self, **kwargs): super(WikiNewsSegmentationDataset, self).__init__(**kwargs) + 
@staticmethod def metadata(): return { "language": "ar", @@ -15,16 +17,23 @@ def metadata(): pages={1070--1074}, year={2016} }""", + "link": "https://github.com/kdarwish/Farasa/blob/master/WikiNews.pos.ref", + "license": "Research Purpose Only", + "splits": { + "test": "data/sequence_tagging_ner_pos_etc/segmentation/WikiNewsTruth.txt", + "train": "data/sequence_tagging_ner_pos_etc/segmentation/WikiNewsTruthDev.txt", + }, + "task_type": TaskType.Other, } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return { "input": "Original sentence", "label": "Sentence with segmented words", } def load_data(self, data_path, no_labels=False): - # TODO: modify to iterator data = [] with open(data_path, "r") as fp: diff --git a/llmebench/datasets/XGLUEPOS.py b/llmebench/datasets/XGLUEPOS.py index b90c9821..b2e12385 100644 --- a/llmebench/datasets/XGLUEPOS.py +++ b/llmebench/datasets/XGLUEPOS.py @@ -1,10 +1,12 @@ from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class XGLUEPOSDataset(DatasetBase): def __init__(self, **kwargs): super(XGLUEPOSDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -15,9 +17,35 @@ def metadata(): pages={6008--6018}, year={2020} }""", + "link": "https://microsoft.github.io/XGLUE/", + "license": "Non-commercial research purposes only", + "splits": { + "dev": "data/sequence_tagging_ner_pos_etc/POS/XGLUE/ar.dev.src-trg.txt", + "test": "data/sequence_tagging_ner_pos_etc/POS/XGLUE/ar.test.src-trg.txt", + }, + "task_type": TaskType.SequenceLabeling, + "class_labels": [ + "ADJ", + "ADP", + "ADV", + "AUX", + "CCONJ", + "DET", + "INTJ", + "NOUN", + "NUM", + "PART", + "PRON", + "PROPN", + "PUNCT", + "SYM", + "VERB", + "X", + ], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return { "input": "Original sentence", "label": "Sentence with POS tags", diff --git a/llmebench/datasets/XNLI.py b/llmebench/datasets/XNLI.py index 
8ab4afd5..37988a62 100644 --- a/llmebench/datasets/XNLI.py +++ b/llmebench/datasets/XNLI.py @@ -1,12 +1,14 @@ import pandas as pd from llmebench.datasets.dataset_base import DatasetBase +from llmebench.tasks import TaskType class XNLIDataset(DatasetBase): def __init__(self, **kwargs): super(XNLIDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -25,9 +27,18 @@ def metadata(): publisher = "Association for Computational Linguistics", location = "Brussels, Belgium", }""", + "link": "https://github.com/facebookresearch/XNLI", + "license": "CC BY-NC 4.0", + "splits": { + "dev": "data/XNLI/xnli.dev.tsv", + "test": "data/XNLI/xnli.test.ar.tsv", + }, + "task_type": TaskType.Classification, + "class_labels": ["contradiction", "entailment", "neutral"], } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "Test\tTest", "label": "neutral"} def load_data(self, data_path): diff --git a/llmebench/datasets/XQuAD.py b/llmebench/datasets/XQuAD.py index 17a936eb..20257d87 100644 --- a/llmebench/datasets/XQuAD.py +++ b/llmebench/datasets/XQuAD.py @@ -1,12 +1,14 @@ import json from llmebench.datasets.SQuADBase import SQuADBase +from llmebench.tasks import TaskType class XQuADDataset(SQuADBase): def __init__(self, **kwargs): super(XQuADDataset, self).__init__(**kwargs) + @staticmethod def metadata(): return { "language": "ar", @@ -17,4 +19,11 @@ def metadata(): pages={4623--4637}, year={2020} }""", + "link": "https://github.com/google-deepmind/xquad", + "license": "CC-BY-SA4.0", + "splits": { + "test": "data/QA/xquad/xquad.ar.json", + "train": "data/QA/ARCD/arcd-train.json", + }, + "task_type": TaskType.QuestionAnswering, } diff --git a/llmebench/datasets/__init__.py b/llmebench/datasets/__init__.py index 9131f301..acd7c879 100644 --- a/llmebench/datasets/__init__.py +++ b/llmebench/datasets/__init__.py @@ -1,13 +1,17 @@ from .ADI import ADIDataset from .Adult import AdultDataset from .ANERcorp import 
ANERcorpDataset +from .ANSFactuality import ANSFactualityDataset +from .ANSStance import ANSStanceDataset from .Aqmar import AqmarDataset from .AraBench import AraBenchDataset from .ArabGend import ArabGendDataset from .ArapTweet import ArapTweetDataset from .ARCD import ARCDDataset from .ArSarcasm import ArSarcasmDataset +from .ArSarcasm2 import ArSarcasm2Dataset from .ArSAS import ArSASDataset +from .ASND import ASNDDataset from .BanglaSentiment import BanglaSentimentDataset from .BibleMaghrebiDiacritization import BibleMaghrebiDiacritizationDataset from .COVID19Factuality import COVID19FactualityDataset @@ -17,22 +21,19 @@ from .CT22Harmful import CT22HarmfulDataset from .CT23Subjectivity import CT23SubjectivityDataset from .Emotion import EmotionDataset -from .Khouja20Factuality import Khouja20FactualityDataset -from .Khouja20Stance import Khouja20StanceDataset from .Location import LocationDataset from .MGBWords import MGBWordsDataset from .MLQA import MLQADataset from .NameInfo import NameInfoDataset -from .NewsCatAkhbarona import NewsCatAkhbaronaDataset -from .NewsCatAlArabiya import NewsCatAlArabiyaDataset -from .NewsCatAlKhaleej import NewsCatAlKhaleejDataset -from .NewsCatASND import NewsCatASNDDataset from .OSACT4SubtaskA import OSACT4SubtaskADataset from .OSACT4SubtaskB import OSACT4SubtaskBDataset from .PADT import PADTDataset from .QADI import QADIDataset from .QCRIDialectalArabicPOS import QCRIDialectalArabicPOSDataset from .QCRIDialectalArabicSegmentation import QCRIDialectalArabicSegmentationDataset +from .SANADAkhbarona import SANADAkhbaronaDataset +from .SANADAlArabiya import SANADAlArabiyaDataset +from .SANADAlKhaleej import SANADAlKhaleejDataset from .SemEval17T1STS import SemEval17T1STSDataset from .SemEval17T2STS import SemEval17T2STSDataset from .SemEval23T3Propaganda import SemEval23T3PropagandaDataset diff --git a/llmebench/datasets/dataset_base.py b/llmebench/datasets/dataset_base.py index a95ff52e..c59843f9 100644 --- 
a/llmebench/datasets/dataset_base.py +++ b/llmebench/datasets/dataset_base.py @@ -52,11 +52,12 @@ class DatasetBase(ABC): """ - def __init__(self, data_dir="data", **kwargs): - self.data_dir = data_dir + def __init__(self, **kwargs): + pass + @staticmethod @abstractmethod - def metadata(self): + def metadata(): """ Returns the dataset's metadata @@ -77,13 +78,38 @@ def metadata(self): "ar" # Single supported language Languages should be identified by their IETF language tags The returned dictionary _can_ have the following additional keys: + "link" : str + Link to the representative page for the dataset + "license" : str + Original license under which the dataset was released + "splits" : dict + A dictionary containing the keys "test", "dev" and "train" + (at least one). "test" will be used automatically for + evaluation, if present. Asset can specify a different split + if necessary. Multiple splits are also supported, by having + a nested dictionary structure, where the first level should + be the split name, and the second level should include the + actual "test"/"dev"/"train" splits. A special "default" split + can also be included, whose value must be a list of split + names that will be run by default. + "task_type" : llmebench.tasks.TaskType + The type of task this dataset targets. Used by the Random + Model. + "class_labels" : list (optional) + List of class labels, must be provided when `task_type` is + `Classification`, `MultiLabelClassification` or + `SequenceLabeling`. + "score_range" : tuple (optional) + Score range defining (min_val, max_val). Must be defined + when `task_type` is `Regression` "download_url" : str (optional) URL to data (for automatic downloads) """ pass + @staticmethod @abstractmethod - def get_data_sample(self): + def get_data_sample(): """ Returns a single data sample. 
@@ -290,7 +316,8 @@ def prepare_fewshots(self, target_data, train_data, n_shots, deduplicate=True): yield examples - def download_dataset(self, download_url=None): + @classmethod + def download_dataset(cls, data_dir, download_url=None): """ Utility method to download a dataset if not present locally on disk. Can handle datasets of types *.zip, *.tar, *.tar.gz, *.tar.bz2, *.tar.xz. @@ -334,7 +361,7 @@ def decompress(fname, action, pup): # Default where the downloaded file is not a container/archive fnames = [fname] - extract_dir = self.__class__.__name__ + extract_dir = cls.__name__ if fname.endswith(".tar.xz"): extractor = Decompress(name=fname[:-3]) @@ -383,7 +410,7 @@ def decompress(fname, action, pup): if download_url is not None: download_urls.append(download_url) - metadata_url = self.metadata().get("download_url", None) + metadata_url = cls.metadata().get("download_url", None) if metadata_url is not None: download_urls.append(metadata_url) @@ -391,7 +418,7 @@ def decompress(fname, action, pup): if default_url is not None: if default_url.endswith("/"): default_url = default_url[:-1] - default_url = f"{default_url}/{self.__class__.__name__}.zip" + default_url = f"{default_url}/{cls.__name__}.zip" download_urls.append(default_url) # Try downloading from available links in order of priority @@ -417,17 +444,15 @@ def decompress(fname, action, pup): retrieve( download_url, known_hash=None, - fname=f"{self.__class__.__name__}{extension}", - path=self.data_dir, + fname=f"{cls.__name__}{extension}", + path=data_dir, progressbar=True, processor=decompress, ) # If it was a *.tar.* file, we can safely delete the # intermediate *.tar file if extension in supported_extensions[:3]: - tar_file_path = ( - Path(self.data_dir) / f"{self.__class__.__name__}.tar" - ) + tar_file_path = Path(data_dir) / f"{cls.__name__}.tar" tar_file_path.unlink() return True except Exception as e: diff --git a/llmebench/tasks/__init__.py b/llmebench/tasks/__init__.py index 29997b19..0510eab6 
100644 --- a/llmebench/tasks/__init__.py +++ b/llmebench/tasks/__init__.py @@ -1,3 +1,5 @@ +from enum import Enum + from .Adult import AdultTask from .ArabicDiacritization import ArabicDiacritizationTask from .ArabicParsing import ArabicParsingTask @@ -29,3 +31,16 @@ from .STS import STSTask from .Subjectivity import SubjectivityTask from .XNLI import XNLITask + +TaskType = Enum( + "TaskType", + [ + "Classification", + "MultiLabelClassification", + "SequenceLabeling", + "QuestionAnswering", + "SequenceToSequence", + "Regression", + "Other", + ], +) diff --git a/llmebench/utils.py b/llmebench/utils.py index 3ee2130d..1595581a 100644 --- a/llmebench/utils.py +++ b/llmebench/utils.py @@ -1,5 +1,7 @@ import importlib.util import sys + +from inspect import signature from pathlib import Path from typing import TYPE_CHECKING @@ -34,3 +36,85 @@ def import_source_file(fname: Path, modname: str) -> "types.ModuleType": except FileNotFoundError as e: raise ImportError(f"{e.strerror}: {fname}") from e return module + + +def is_fewshot_asset(config, prompt_fn): + """Detect if a given asset is zero shot or few shot""" + sig = signature(prompt_fn) + general_args = config.get("general_args", {}) + return "fewshot" in general_args or len(sig.parameters) == 2 + + +def get_data_paths(config, split): + """Given an asset config, return the appropriate data paths""" + assert split in ["train", "test"] + + dataset_args = config.get("dataset_args", {}) + dataset = config["dataset"](**dataset_args) + + if split == "test": + data_args = config.get("general_args", {}) + elif split == "train": + general_args = config.get("general_args", {}) + data_args = general_args.get("fewshot", {}) + + data_paths = [] + if f"custom_{split}_split" in data_args: + data_paths.append(("custom", data_args[f"custom_{split}_split"])) + elif f"{split}_split" in data_args: + requested_splits = data_args[f"{split}_split"] + if not isinstance(requested_splits, list): + requested_splits = [requested_splits] + 
requested_splits = [rs.split("/") for rs in requested_splits] + available_splits = dataset.metadata()["splits"] + + for requested_split in requested_splits: + if len(requested_split) == 1: + # Single level split like "test" or "ar" + assert ( + requested_split[0] in available_splits + ), "Requested split not found in dataset" + if split in available_splits[requested_split[0]]: + # Pick "test"/"train" automatically, if available + data_paths.append( + ( + requested_split[0], + available_splits[requested_split[0]][split], + ) + ) + else: + data_paths.append( + (requested_split[0], available_splits[requested_split[0]]) + ) + else: + # Multilevel split like "ar" -> "test" + assert ( + requested_split[0] in available_splits + ), "Requested split not found in dataset" + assert ( + requested_split[1] in available_splits[requested_split[0]] + ), "Requested split not found in dataset" + data_paths.append( + ( + f"{requested_split[0]}/{requested_split[1]}", + available_splits[requested_split[0]][requested_split[1]], + ) + ) + else: + # Use default splits + available_splits = dataset.metadata()["splits"] + if "default" in available_splits: + # Multilevel splits + for av_split in available_splits["default"]: + assert ( + split in available_splits[av_split] + ), f'No "{split}" split found in dataset, please specify split explicitly. Available splits are: {", ".join(available_splits[av_split])}' + data_paths.append((av_split, available_splits[av_split][split])) + else: + # Single level splits + assert ( + split in available_splits + ), f'No "{split}" split found in dataset, please specify split explicitly. 
Available splits are: {", ".join(available_splits)}' + data_paths.append((split, available_splits[split])) + + return data_paths diff --git a/scripts/find_dataset_metadata.py b/scripts/find_dataset_metadata.py new file mode 100644 index 00000000..31f1a758 --- /dev/null +++ b/scripts/find_dataset_metadata.py @@ -0,0 +1,336 @@ +from collections import defaultdict + +from llmebench import Benchmark + +from pprint import pprint + +import json + +class bcolors: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKCYAN = '\033[96m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + +def main(): + benchmark = Benchmark(benchmark_dir="assets") + + assets = benchmark.find_assets() + + train_dataset_metadata = defaultdict(set) + test_dataset_metadata = defaultdict(set) + class_labels_metadata = defaultdict(set) + + for asset in assets: + configs = asset["module"].config() + if isinstance(configs, dict): + configs = [{"name": "dummy", "config": configs}] + + asset_name = asset["name"] + for zap in ["_BLOOMZ", "_GPT4", "_GPT35", "_ZeroShot", "_FewShot", "_mdeberta_v3_base_squad2", "_Intfloat_Multilingual_e5_small", "_Camelbert_da_sentiment"]: + asset_name = asset_name.replace(zap, "") + + for c in configs: + config = c["config"] + dataset_name = config["dataset"].__name__ + + data_path = config["general_args"]["data_path"] + if isinstance(data_path, dict): + if "split" in data_path: + assert data_path["split"] == "test" + data_path = data_path["path"] + else: + data_path = data_path["sentences_path"] + + train_data_path = None + if "fewshot" in config["general_args"]: + train_data_path = config["general_args"]["fewshot"]["train_data_path"] + if isinstance(train_data_path, dict): + if "split" in train_data_path: + assert train_data_path["split"] == "train" or train_data_path["split"] == "dev" + train_data_path = train_data_path["path"] + else: + train_data_path = train_data_path["sentences_path"] + + 
test_dataset_metadata[dataset_name].add((data_path, asset_name)) + if train_data_path: + train_dataset_metadata[dataset_name].add((train_data_path, asset_name)) + + + if "class_labels" in config["model_args"]: + # Ignore language + class_labels_metadata[dataset_name].add((tuple(config["model_args"]["class_labels"]), asset_name[3:])) + + # print("Test data paths") + for dataset in sorted(test_dataset_metadata): + print("================================================") + print(dataset) + obj = {} + if dataset == "SemEval23T3PropagandaDataset": + obj = {} + mapping = { + "en": "en", + "fr": "fr", + "ge": "de", + "it": "it", + "po": "pl", + "ru": "ru" + } + exceptions = [] + for xlang in mapping: + lang = mapping[xlang] + try: + obj[lang] = {} + test_path = [p for p, _ in test_dataset_metadata[dataset] if f"/{xlang}_" in p][0] + if "dev" in test_path: + obj[lang]["dev"] = test_path + else: + obj[lang]["test"] = test_path + + train_path = [p for p, _ in train_dataset_metadata[dataset] if f"/{xlang}_" in p][0] + if "dev" in train_path: + assert "dev" not in obj + obj[lang]["dev"] = train_path + else: + obj[lang]["train"] = train_path + except: + exceptions.append(xlang) + + for xlang in exceptions: + if xlang == "ar": + obj["ar"] = { + "dev": obj["en"]["dev"][:obj["en"]["dev"].rfind("/")] + "/ar_dev_subtask3.json", + "train": obj["en"]["train"][:obj["en"]["train"].rfind("/")] + "/ar_train_subtask3.json" + } + else: + raise Exception() + elif dataset == "CT22CheckworthinessDataset": + obj = {} + mapping = {'dutch': "nl", + 'arabic': "ar", + 'spanish': "es", + 'bulgarian': "bg", + 'turkish': "tr", + 'english': "en" + } + exceptions = [] + for xlang in mapping: + lang = mapping[xlang] + obj[lang] = {} + try: + test_path = [p for p, _ in test_dataset_metadata[dataset] if f"/{xlang}/" in p][0] + if "dev" in test_path: + obj[lang]["dev"] = test_path + else: + obj[lang]["test"] = test_path + + train_path = [p for p, _ in train_dataset_metadata[dataset] if f"/{xlang}/" in 
p][0] + if "dev" in train_path: + assert "dev" not in obj + obj[lang]["dev"] = train_path + else: + obj[lang]["train"] = train_path + except: + exceptions.append(xlang) + assert len(exceptions) == 0 + elif dataset == "AraBenchDataset": + obj = { + "APT-LEV": { + "train": ("data/MT/ldc_web_lv.train.lev.0.lv.ar", "data/MT/ldc_web_lv.train.lev.0.lv.en"), + "dev": ("data/MT/ldc_web_lv.dev.lev.0.lv.ar", "data/MT/ldc_web_lv.dev.lev.0.lv.en"), + "test": ("data/MT/ldc_web_lv.test.lev.0.lv.ar", "data/MT/ldc_web_lv.test.lev.0.lv.en") + }, + "APT-Nile": { + "train": ("data/MT/ldc_web_eg.train.nil.0.eg.ar", "data/MT/ldc_web_eg.train.nil.0.eg.en"), + "dev": ("data/MT/ldc_web_eg.dev.nil.0.eg.ar", "data/MT/ldc_web_eg.dev.nil.0.eg.en"), + "test": ("data/MT/ldc_web_eg.test.nil.0.eg.ar", "data/MT/ldc_web_eg.test.nil.0.eg.en") + }, + "MADAR-Gulf" : { + "train": ("data/MT/madar.train.glf.0.qa.ar", "data/MT/madar.train.glf.0.qa.en"), + "dev": ("data/MT/madar.dev.glf.0.qa.ar", "data/MT/madar.dev.glf.0.qa.en"), + "test": [ + ("data/MT/madar.test.glf.0.iq.ar", "data/MT/madar.test.glf.0.iq.en"), + ("data/MT/madar.test.glf.1.iq.ar", "data/MT/madar.test.glf.1.iq.en"), + ("data/MT/madar.test.glf.2.iq.ar", "data/MT/madar.test.glf.2.iq.en"), + ("data/MT/madar.test.glf.0.om.ar", "data/MT/madar.test.glf.0.om.en"), + ("data/MT/madar.test.glf.0.qa.ar", "data/MT/madar.test.glf.0.qa.en"), + ("data/MT/madar.test.glf.0.sa.ar", "data/MT/madar.test.glf.0.sa.en"), + ("data/MT/madar.test.glf.1.sa.ar", "data/MT/madar.test.glf.1.sa.en"), + ("data/MT/madar.test.glf.0.ye.ar", "data/MT/madar.test.glf.0.ye.en") + ] + }, + "MADAR-LEV" : { + "train": ("data/MT/madar.train.lev.0.lb.ar", "data/MT/madar.train.lev.0.lb.en"), + "dev": ("data/MT/madar.dev.lev.0.lb.ar", "data/MT/madar.dev.lev.0.lb.en"), + "test": [ + ("data/MT/madar.test.lev.0.jo.ar", "data/MT/madar.test.lev.0.jo.en"), + ("data/MT/madar.test.lev.1.jo.ar", "data/MT/madar.test.lev.1.jo.en"), + ("data/MT/madar.test.lev.0.lb.ar", 
"data/MT/madar.test.lev.0.lb.en"), + ("data/MT/madar.test.lev.0.pa.ar", "data/MT/madar.test.lev.0.pa.en"), + ("data/MT/madar.test.lev.0.sy.ar", "data/MT/madar.test.lev.0.sy.en"), + ("data/MT/madar.test.lev.1.sy.ar", "data/MT/madar.test.lev.1.sy.en") + ] + }, + "MADAR-MGR" : { + "train": [ + ("data/MT/madar.train.mgr.0.ma.ar", "data/MT/madar.train.mgr.0.ma.en"), + ("data/MT/madar.train.mgr.0.tn.ar", "data/MT/madar.train.mgr.0.tn.en") + ], + "dev": ("data/MT/madar.dev.mgr.0.ma.ar", "data/MT/madar.dev.mgr.0.ma.en"), + "test": [ + ("data/MT/madar.test.mgr.0.dz.ar", "data/MT/madar.test.mgr.0.dz.en"), + ("data/MT/madar.test.mgr.0.ly.ar", "data/MT/madar.test.mgr.0.ly.en"), + ("data/MT/madar.test.mgr.1.ly.ar", "data/MT/madar.test.mgr.1.ly.en"), + ("data/MT/madar.test.mgr.0.ma.ar", "data/MT/madar.test.mgr.0.ma.en"), + ("data/MT/madar.test.mgr.1.ma.ar", "data/MT/madar.test.mgr.1.ma.en"), + ("data/MT/madar.test.mgr.0.tn.ar", "data/MT/madar.test.mgr.0.tn.en"), + ("data/MT/madar.test.mgr.1.tn.ar", "data/MT/madar.test.mgr.1.tn.en") + ] + }, + "MADAR-MSA" : { + "train": ("data/MT/madar.train.msa.0.ms.ar", "data/MT/madar.train.msa.0.ms.en"), + "dev": ("data/MT/madar.dev.msa.0.ms.ar", "data/MT/madar.dev.msa.0.ms.en"), + "test": ("data/MT/madar.test.msa.0.ms.ar", "data/MT/madar.test.msa.0.ms.en") + }, + "MADAR-Nile" : { + "train": ("data/MT/madar.train.nil.0.eg.ar", "data/MT/madar.train.nil.0.eg.en"), + "dev": ("data/MT/madar.dev.nil.0.eg.ar", "data/MT/madar.dev.nil.0.eg.en"), + "test": [ + ("data/MT/madar.test.nil.0.eg.ar", "data/MT/madar.test.nil.0.eg.en"), + ("data/MT/madar.test.nil.1.eg.ar", "data/MT/madar.test.nil.1.eg.en"), + ("data/MT/madar.test.nil.2.eg.ar", "data/MT/madar.test.nil.2.eg.en"), + ("data/MT/madar.test.nil.0.sd.ar", "data/MT/madar.test.nil.0.sd.en") + ] + }, + "MDC-LEV" : { + "dev": ("data/MT/ldc_web_eg.dev.lev.0.sy.ar", "data/MT/ldc_web_eg.dev.lev.0.sy.en"), + "test": [ + ("data/MT/ldc_web_eg.test.lev.0.jo.ar", "data/MT/ldc_web_eg.test.lev.0.jo.en"), + 
("data/MT/ldc_web_eg.test.lev.0.ps.ar", "data/MT/ldc_web_eg.test.lev.0.ps.en"), + ("data/MT/ldc_web_eg.test.lev.0.sy.ar", "data/MT/ldc_web_eg.test.lev.0.sy.en"), + ] + }, + "MDC-MGR" : { + "test": ("data/MT/ldc_web_eg.test.mgr.0.tn.ar", "data/MT/ldc_web_eg.test.mgr.0.tn.en"), + }, + "MDC-MSA" : { + "test": ("data/MT/ldc_web_eg.test.msa.0.ms.ar", "data/MT/ldc_web_eg.test.msa.0.ms.en"), + }, + "Media-Gulf" : { + "test": ("data/MT/summa-Oman.test.glf.0.om.ar", "data/MT/summa-Oman.test.glf.0.om.en") + }, + "Media-LEV" : { + "test": ("data/MT/summa-LBC.test.lev.0.lb.ar", "data/MT/summa-LBC.test.lev.0.lb.en") + }, + "Media-MGR" : { + "test": ("data/MT/summa-2M.test.mgr.0.ma.ar", "data/MT/summa-2M.test.mgr.0.ma.en") + }, + "Media-MSA" : { + "test": [ + ("data/MT/summa-AJ.test.msa.0.ms.ar", "data/MT/summa-AJ.test.msa.0.ms.en"), + ("data/MT/summa-BBC.test.msa.0.ms.ar", "data/MT/summa-BBC.test.msa.0.ms.en") + ] + }, + "QAraC-Gulf" : { + "dev": ("data/MT/QAraC.dev.glf.0.qa.ar", "data/MT/QAraC.dev.glf.0.qa.ar"), + "test": ("data/MT/QAraC.test.glf.0.qa.ar", "data/MT/QAraC.test.glf.0.qa.en") + }, + "Bible-MGR" : { + "train": [ + ("data/MT/bible.train.mgr.0.ma.ar", "data/MT/bible.train.mgr.0.ma.en"), + ("data/MT/bible.train.mgr.0.tn.ar", "data/MT/bible.train.mgr.0.tn.en"), + ], + "dev": [ + ("data/MT/bible.dev.mgr.0.ma.ar", "data/MT/bible.dev.mgr.0.ma.en"), + ("data/MT/bible.dev.mgr.0.tn.ar", "data/MT/bible.dev.mgr.0.tn.en"), + ], + "test": [ + ("data/MT/bible.test.mgr.0.ma.ar", "data/MT/bible.test.mgr.0.ma.en"), + ("data/MT/bible.test.mgr.0.tn.ar", "data/MT/bible.test.mgr.0.tn.en"), + ] + }, + "Bible-MSA" : { + "train": [ + ("data/MT/bible.train.msa.0.ms.ar", "data/MT/bible.train.msa.0.ms.en"), + ("data/MT/bible.train.msa.1.ms.ar", "data/MT/bible.train.msa.1.ms.en"), + ], + "dev": [ + ("data/MT/bible.dev.msa.0.ms.ar", "data/MT/bible.dev.msa.0.ms.en"), + ("data/MT/bible.dev.msa.1.ms.ar", "data/MT/bible.dev.msa.1.ms.en") + ], + "test": [ + ("data/MT/bible.test.msa.0.ms.ar", 
"data/MT/bible.test.msa.0.ms.en"), + ("data/MT/bible.test.msa.1.ms.ar", "data/MT/bible.test.msa.1.ms.en") + ] + }, + "default": ["APT-LEV", "APT-Nile", "MADAR-Gulf", "MADAR-LEV", "MADAR-MGR", "MADAR-MSA", "MADAR-Nile", "MDC-LEV", "MDC-MGR", "MDC-MSA", "Bible-MGR", "Bible-MSA"] + } + elif dataset == "QCRIDialectalArabicSegmentationDataset": + obj = { + "test": [ + "data/sequence_tagging_ner_pos_etc/segmentation/glf.seg/glf.data_5.test.src.sent", + "data/sequence_tagging_ner_pos_etc/segmentation/lev.seg/lev.data_5.test.src.sent", + "data/sequence_tagging_ner_pos_etc/segmentation/egy.seg/egy.data_5.test.src.sent", + "data/sequence_tagging_ner_pos_etc/segmentation/mgr.seg/mgr.data_5.test.src.sent" + ] + } + elif dataset == "BibleMaghrebiDiacritizationDataset": + obj = { + "test": [ + "data/sequence_tagging_ner_pos_etc/diacritization/morrocan_f05.test.src-tgt.txt", + "data/sequence_tagging_ner_pos_etc/diacritization/tunisian_f05.test.src-tgt.txt" + ] + } + elif dataset == "QCRIDialectalArabicPOSDataset": + obj = { + "dev": [ + "data/sequence_tagging_ner_pos_etc/POS/egy.pos/egy.data_5.dev.src-trg.sent", + "data/sequence_tagging_ner_pos_etc/POS/glf.pos/glf.data_5.dev.src-trg.sent", + "data/sequence_tagging_ner_pos_etc/POS/mgr.pos/mgr.data_5.dev.src-trg.sent", + "data/sequence_tagging_ner_pos_etc/POS/lev.pos/lev.data_5.dev.src-trg.sent" + ], + "test": [ + "data/sequence_tagging_ner_pos_etc/POS/egy.pos/egy.data_5.test.src-trg.sent", + "data/sequence_tagging_ner_pos_etc/POS/glf.pos/glf.data_5.test.src-trg.sent", + "data/sequence_tagging_ner_pos_etc/POS/mgr.pos/mgr.data_5.test.src-trg.sent", + "data/sequence_tagging_ner_pos_etc/POS/lev.pos/lev.data_5.test.src-trg.sent" + ] + } + elif len(list(test_dataset_metadata[dataset])) > 1 or len(list(train_dataset_metadata[dataset])) > 1: + for path, source in train_dataset_metadata[dataset]: + print(f"\t{path} ({source})") + for path, source in test_dataset_metadata[dataset]: + print(f"\t{path} ({source})") + else: + test_path = 
list(test_dataset_metadata[dataset])[0][0] + if "dev" in test_path: + obj["dev"] = test_path + else: + obj["test"] = test_path + + if dataset in train_dataset_metadata and len(train_dataset_metadata[dataset]) > 0: + train_path = list(train_dataset_metadata[dataset])[0][0] + if "dev" in train_path: + assert "dev" not in obj + obj["dev"] = train_path + else: + obj["train"] = train_path + + pprint(obj) + print("") + if len(class_labels_metadata[dataset]) > 1: + print(bcolors.FAIL) + for label_list, asset in class_labels_metadata[dataset]: + print(f"{label_list} ({asset})") + print(bcolors.ENDC) + # print(json.dumps(obj, indent=2)) + + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tests/datasets/test_download_and_caching.py b/tests/datasets/test_download_and_caching.py index 3a9e471e..4980c79d 100644 --- a/tests/datasets/test_download_and_caching.py +++ b/tests/datasets/test_download_and_caching.py @@ -24,26 +24,16 @@ def service_actions(self): class MockDataset(DatasetBase): - def metadata(self): + def metadata(): return {} - def get_data_sample(self): + def get_data_sample(): return {"input": "input", "label": "label"} def load_data(self, data_path): return [self.get_data_sample() for _ in range(100)] -class MockDatasetWithDownloadURL(MockDataset): - def __init__(self, port, filename="MockDataset.zip", **kwargs): - self.port = port - self.filename = filename - super(MockDatasetWithDownloadURL, self).__init__(**kwargs) - - def metadata(self): - return {"download_url": f"http://localhost:{self.port}/{self.filename}"} - - class TestDatasetAutoDownload(unittest.TestCase): @classmethod def setUpClass(cls): @@ -81,10 +71,11 @@ def test_auto_download_zip(self): data_dir = TemporaryDirectory() - dataset = MockDataset(data_dir=data_dir.name) + dataset = MockDataset() self.assertTrue( dataset.download_dataset( - download_url=f"http://localhost:{self.port}/MockDataset.zip" + data_dir=data_dir.name, + 
download_url=f"http://localhost:{self.port}/MockDataset.zip", ) ) @@ -95,10 +86,11 @@ def test_auto_download_tar(self): data_dir = TemporaryDirectory() - dataset = MockDataset(data_dir=data_dir.name) + dataset = MockDataset() self.assertTrue( dataset.download_dataset( - download_url=f"http://localhost:{self.port}/MockDataset.tar" + data_dir=data_dir.name, + download_url=f"http://localhost:{self.port}/MockDataset.tar", ) ) @@ -109,10 +101,11 @@ def test_auto_download_tar_gz(self): data_dir = TemporaryDirectory() - dataset = MockDataset(data_dir=data_dir.name) + dataset = MockDataset() self.assertTrue( dataset.download_dataset( - download_url=f"http://localhost:{self.port}/MockDataset.tar.gz" + data_dir=data_dir.name, + download_url=f"http://localhost:{self.port}/MockDataset.tar.gz", ) ) @@ -123,10 +116,11 @@ def test_auto_download_tar_bz2(self): data_dir = TemporaryDirectory() - dataset = MockDataset(data_dir=data_dir.name) + dataset = MockDataset() self.assertTrue( dataset.download_dataset( - download_url=f"http://localhost:{self.port}/MockDataset.tar.bz2" + data_dir=data_dir.name, + download_url=f"http://localhost:{self.port}/MockDataset.tar.bz2", ) ) @@ -137,10 +131,11 @@ def test_auto_download_tar_xz(self): data_dir = TemporaryDirectory() - dataset = MockDataset(data_dir=data_dir.name) + dataset = MockDataset() self.assertTrue( dataset.download_dataset( - download_url=f"http://localhost:{self.port}/MockDataset.tar.xz" + data_dir=data_dir.name, + download_url=f"http://localhost:{self.port}/MockDataset.tar.xz", ) ) @@ -151,14 +146,14 @@ def test_auto_download_default_url(self): data_dir = TemporaryDirectory() - dataset = MockDataset(data_dir=data_dir.name) + dataset = MockDataset() with patch.dict( "os.environ", { "DEFAULT_DOWNLOAD_URL": f"http://localhost:{self.port}/", }, ): - self.assertTrue(dataset.download_dataset()) + self.assertTrue(dataset.download_dataset(data_dir=data_dir.name)) self.check_downloaded(Path(data_dir.name), "MockDataset", "zip") @@ -173,8 
+168,12 @@ def test_auto_download_metadata_url(self): data_dir = TemporaryDirectory() - dataset = MockDatasetWithDownloadURL(data_dir=data_dir.name, port=self.port) - self.assertTrue(dataset.download_dataset()) + class MockDatasetWithDownloadURL(MockDataset): + def metadata(): + return {"download_url": f"http://localhost:{self.port}/MockDataset.zip"} + + dataset = MockDatasetWithDownloadURL() + self.assertTrue(dataset.download_dataset(data_dir=data_dir.name)) self.check_downloaded(Path(data_dir.name), "MockDatasetWithDownloadURL", "zip") @@ -189,12 +188,17 @@ def test_auto_download_non_existent(self): data_dir = TemporaryDirectory() - dataset = MockDatasetWithDownloadURL( - data_dir=data_dir.name, port=self.port, filename="InvalidDataset.zip" - ) + class MockDatasetWithDownloadURL(MockDataset): + def metadata(): + return { + "download_url": f"http://localhost:{self.port}/InvalidDataset.zip" + } + + dataset = MockDatasetWithDownloadURL() self.assertFalse( dataset.download_dataset( - download_url="http://invalid.llmebench-server.org/Dataset.zip" + data_dir=data_dir.name, + download_url="http://invalid.llmebench-server.org/Dataset.zip", ) ) @@ -212,9 +216,10 @@ def test_cache_existing_file(self): # download_dataset should not reach out to the invalid server, # since file is present locally - dataset = MockDataset(data_dir=data_dir.name) + dataset = MockDataset() self.assertTrue( dataset.download_dataset( - download_url="http://invalid.llmebench-server.org/ExistingData.zip" + data_dir=data_dir.name, + download_url="http://invalid.llmebench-server.org/ExistingData.zip", ) ) diff --git a/tests/datasets/test_implementation.py b/tests/datasets/test_implementation.py index 52b36c3c..98357131 100644 --- a/tests/datasets/test_implementation.py +++ b/tests/datasets/test_implementation.py @@ -23,3 +23,21 @@ def test_base_constructor(self): for dataset in self.datasets: with self.subTest(msg=dataset.__name__): base_class_constructor_checker(dataset, self) + + def 
test_metadata_static(self): + "Test if all datasets mark metadata method as static" + + for dataset in self.datasets: + with self.subTest(msg=dataset.__name__): + method_impl = dataset.__dict__.get("metadata") + if method_impl: + self.assertIsInstance(method_impl, staticmethod) + + def test_get_data_sample_static(self): + "Test if all datasets mark get_data_sample method as static" + + for dataset in self.datasets: + with self.subTest(msg=dataset.__name__): + method_impl = dataset.__dict__.get("get_data_sample") + if method_impl: + self.assertIsInstance(method_impl, staticmethod) diff --git a/tests/datasets/test_metadata.py b/tests/datasets/test_metadata.py index b185c82f..97b19a5c 100644 --- a/tests/datasets/test_metadata.py +++ b/tests/datasets/test_metadata.py @@ -6,6 +6,7 @@ import llmebench.datasets as datasets from langcodes import tag_is_valid +from llmebench.tasks import TaskType class TestDatasetMetadata(unittest.TestCase): @@ -22,13 +23,15 @@ def test_dataset_metadata(self): for dataset in self.datasets: with self.subTest(msg=dataset.__name__): - self.assertIsInstance(dataset.metadata(), dict) - self.assertIn("citation", dataset.metadata()) - self.assertIsInstance(dataset.metadata()["citation"], str) - self.assertIn("language", dataset.metadata()) - self.assertIsInstance(dataset.metadata()["language"], (str, list)) + metadata = dataset.metadata() - languages = dataset.metadata()["language"] + self.assertIsInstance(metadata, dict) + self.assertIn("citation", metadata) + self.assertIsInstance(metadata["citation"], str) + self.assertIn("language", metadata) + self.assertIsInstance(metadata["language"], (str, list)) + + languages = metadata["language"] if isinstance(languages, str): languages = [languages] @@ -37,3 +40,28 @@ def test_dataset_metadata(self): language == "multilingual" or tag_is_valid(language), f"{language} is not a valid language", ) + + self.assertIn("splits", metadata) + for split_name in metadata["splits"]: + self.assertFalse( + "/" in 
split_name, "Split names cannot contain '/'" + ) + if isinstance(metadata["splits"][split_name], dict): + for sub_split_name in metadata["splits"][split_name]: + self.assertFalse( + "/" in split_name, "Split names cannot contain '/'" + ) + + self.assertIn("task_type", metadata) + self.assertIsInstance(metadata["task_type"], TaskType) + + if metadata["task_type"] in [ + TaskType.Classification, + TaskType.SequenceLabeling, + TaskType.MultiLabelClassification, + ]: + self.assertIn("class_labels", metadata) + self.assertIsInstance(metadata["class_labels"], list) + elif metadata["task_type"] == TaskType.Regression: + self.assertIn("score_range", metadata) + self.assertIsInstance(metadata["score_range"], tuple) diff --git a/tests/models/test_HuggingFaceInferenceAPI.py b/tests/models/test_HuggingFaceInferenceAPI.py index 0f93cf3f..41ee5663 100644 --- a/tests/models/test_HuggingFaceInferenceAPI.py +++ b/tests/models/test_HuggingFaceInferenceAPI.py @@ -4,6 +4,8 @@ from llmebench import Benchmark from llmebench.models import HuggingFaceInferenceAPIModel, HuggingFaceTaskTypes +from llmebench.utils import is_fewshot_asset + class TestAssetsForHuggingFaceInferenceAPIPrompts(unittest.TestCase): @classmethod @@ -29,7 +31,7 @@ def test_huggingface_inference_api_prompts(self): config = asset["config"] dataset = config["dataset"](**config["dataset_args"]) data_sample = dataset.get_data_sample() - if "fewshot" in config["general_args"]: + if is_fewshot_asset(config, asset["module"].prompt): prompt = asset["module"].prompt( data_sample["input"], [data_sample for _ in range(n_shots)], diff --git a/tests/models/test_OpenAIModel.py b/tests/models/test_OpenAIModel.py index 7f2ee1ba..292b3e22 100644 --- a/tests/models/test_OpenAIModel.py +++ b/tests/models/test_OpenAIModel.py @@ -6,6 +6,8 @@ from llmebench import Benchmark from llmebench.models import OpenAIModel +from llmebench.utils import is_fewshot_asset + class TestAssetsForOpenAIPrompts(unittest.TestCase): @classmethod @@ -29,7 
+31,7 @@ def test_openai_prompts(self): config = asset["config"] dataset = config["dataset"](**config["dataset_args"]) data_sample = dataset.get_data_sample() - if "fewshot" in config["general_args"]: + if is_fewshot_asset(config, asset["module"].prompt): prompt = asset["module"].prompt( data_sample["input"], [data_sample for _ in range(n_shots)], diff --git a/tests/models/test_Petals.py b/tests/models/test_Petals.py index e661d227..713656b1 100644 --- a/tests/models/test_Petals.py +++ b/tests/models/test_Petals.py @@ -4,6 +4,8 @@ from llmebench import Benchmark from llmebench.models import PetalsModel +from llmebench.utils import is_fewshot_asset + class TestAssetsForPetalsPrompts(unittest.TestCase): @classmethod @@ -27,7 +29,7 @@ def test_petals_prompts(self): config = asset["config"] dataset = config["dataset"](**config["dataset_args"]) data_sample = dataset.get_data_sample() - if "fewshot" in config["general_args"]: + if is_fewshot_asset(config, asset["module"].prompt): prompt = asset["module"].prompt( data_sample["input"], [data_sample for _ in range(n_shots)], diff --git a/tests/tasks/test_implementation.py b/tests/tasks/test_implementation.py index fb21ded5..a50145bc 100644 --- a/tests/tasks/test_implementation.py +++ b/tests/tasks/test_implementation.py @@ -4,6 +4,7 @@ from pathlib import Path import llmebench.tasks as tasks +from llmebench.tasks.task_base import TaskBase from tests.utils import base_class_constructor_checker @@ -13,7 +14,13 @@ class TestTaskImplementation(unittest.TestCase): def setUpClass(cls): # Search for all implemented models framework_dir = Path("llmebench") - cls.tasks = set([m[1] for m in inspect.getmembers(tasks, inspect.isclass)]) + cls.tasks = set( + [ + m[1] + for m in inspect.getmembers(tasks, inspect.isclass) + if issubclass(m[1], TaskBase) + ] + ) def test_base_constructor(self): "Test if all tasks also call the base class constructor" diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 800866c5..4ee1f81c 
100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -17,14 +17,47 @@ class MockDataset(DatasetBase): - def metadata(self): - return {} + @staticmethod + def metadata(): + return { + "splits": { + "train": ["default_train_data1", "default_train_data2"], + "dev": ["default_dev_data1", "default_dev_data2"], + "test": ["default_test_data1", "default_test_data2"], + } + } - def get_data_sample(self): + @staticmethod + def get_data_sample(): return {"input": "input", "label": "label"} def load_data(self, data_path): - return [self.get_data_sample() for _ in range(100)] + return [{"input_id": i, **self.get_data_sample()} for i in data_path] + + +class MockDatasetWithMultiLevelSplits(DatasetBase): + @staticmethod + def metadata(): + return { + "splits": { + "ar": { + "dev": ["default_ar_dev_data1", "default_ar_dev_data2"], + "test": ["default_ar_test_data1", "default_ar_test_data2"], + }, + "en": { + "dev": ["default_en_dev_data1", "default_en_dev_data2"], + "test": ["default_en_test_data1", "default_en_test_data2"], + }, + "default": ["ar", "en"], + } + } + + @staticmethod + def get_data_sample(): + return {"input": "input", "label": "label"} + + def load_data(self, data_path): + return [{"input_id": i, **self.get_data_sample()} for i in data_path] class MockModel(ModelBase): @@ -45,12 +78,8 @@ class MockAsset(object): def config(): return { "dataset": MockDataset, - "dataset_args": {}, "task": MockTask, - "task_args": {}, "model": MockModel, - "model_args": {}, - "general_args": {"data_path": "fake/path/to/data"}, } @staticmethod @@ -62,6 +91,24 @@ def post_process(response): return response +class MockFewShotAsset(object): + @staticmethod + def config(): + return { + "dataset": MockDataset, + "task": MockTask, + "model": MockModel, + } + + @staticmethod + def prompt(input_sample, samples): + return {"prompt": input_sample} + + @staticmethod + def post_process(response): + return response + + class MockFailingAsset(MockAsset): def prompt(input_sample): 
raise Exception("Fail!") @@ -300,3 +347,282 @@ def test_multi_config_asset(self, asset_importer_mock): for subconfig in config: self.assertIn(f"sample/{subconfig['name']}", results) + + @patch("llmebench.benchmark.Benchmark.find_assets") + def test_asset_with_default_splits(self, asset_finder_mock): + "Run benchmark with an asset and its default splits" + asset_finder_mock.return_value = [ + { + "name": "MockAsset 1", + "config": MockAsset.config(), + "module": MockAsset, + } + ] + + testargs = ["llmebench", self.benchmark_dir.name, self.results_dir.name] + with patch.object(sys, "argv", testargs): + llmebench.benchmark.main() + + with open(Path(self.results_dir.name) / "all_results.json") as fp: + results = json.load(fp) + self.assertEqual(len(results), 1) + + with open(Path(self.results_dir.name) / "MockAsset 1" / "0.json") as fp: + cache_obj = json.load(fp) + self.assertEqual(cache_obj["input"]["input_id"], "default_test_data1") + + @patch("llmebench.benchmark.Benchmark.find_assets") + def test_asset_with_selected_split(self, asset_finder_mock): + "Run benchmark with an asset and a selected split" + asset_finder_mock.return_value = [ + { + "name": "MockAsset 1", + "config": {**MockAsset.config(), "general_args": {"test_split": "dev"}}, + "module": MockAsset, + } + ] + + testargs = ["llmebench", self.benchmark_dir.name, self.results_dir.name] + with patch.object(sys, "argv", testargs): + llmebench.benchmark.main() + + with open(Path(self.results_dir.name) / "all_results.json") as fp: + results = json.load(fp) + self.assertEqual(len(results), 1) + + with open(Path(self.results_dir.name) / "MockAsset 1" / "0.json") as fp: + cache_obj = json.load(fp) + self.assertEqual(cache_obj["input"]["input_id"], "default_dev_data1") + + @patch("llmebench.benchmark.Benchmark.find_assets") + def test_asset_with_selected_splits(self, asset_finder_mock): + "Run benchmark with an asset and a selected splits" + asset_finder_mock.return_value = [ + { + "name": "MockAsset 1", + 
"config": { + **MockAsset.config(), + "general_args": {"test_split": ["dev", "test"]}, + }, + "module": MockAsset, + } + ] + + testargs = ["llmebench", self.benchmark_dir.name, self.results_dir.name] + with patch.object(sys, "argv", testargs): + llmebench.benchmark.main() + + with open(Path(self.results_dir.name) / "all_results.json") as fp: + results = json.load(fp) + self.assertEqual(len(results), 2) + + with open( + Path(self.results_dir.name) / "MockAsset 1" / "dev" / "0.json" + ) as fp: + cache_obj = json.load(fp) + self.assertEqual(cache_obj["input"]["input_id"], "default_dev_data1") + + with open( + Path(self.results_dir.name) / "MockAsset 1" / "test" / "0.json" + ) as fp: + cache_obj = json.load(fp) + self.assertEqual(cache_obj["input"]["input_id"], "default_test_data1") + + @patch("llmebench.benchmark.Benchmark.find_assets") + def test_asset_with_custom_splits(self, asset_finder_mock): + "Run benchmark with an asset and a custom split" + asset_finder_mock.return_value = [ + { + "name": "MockAsset 1", + "config": { + **MockAsset.config(), + "general_args": { + "custom_test_split": ["custom_data_1", "custom_data_2"] + }, + }, + "module": MockAsset, + } + ] + + testargs = ["llmebench", self.benchmark_dir.name, self.results_dir.name] + with patch.object(sys, "argv", testargs): + llmebench.benchmark.main() + + with open(Path(self.results_dir.name) / "all_results.json") as fp: + results = json.load(fp) + self.assertEqual(len(results), 1) + + with open(Path(self.results_dir.name) / "MockAsset 1" / "0.json") as fp: + cache_obj = json.load(fp) + self.assertEqual(cache_obj["input"]["input_id"], "custom_data_1") + + @patch("llmebench.benchmark.Benchmark.find_assets") + def test_asset_with_default_splits_multilevel(self, asset_finder_mock): + "Run benchmark with an asset (containing multi-level splits) and its default splits" + asset_finder_mock.return_value = [ + { + "name": "MockAsset 1", + "config": { + **MockAsset.config(), + "dataset": 
MockDatasetWithMultiLevelSplits, + }, + "module": MockAsset, + } + ] + + testargs = ["llmebench", self.benchmark_dir.name, self.results_dir.name] + with patch.object(sys, "argv", testargs): + llmebench.benchmark.main() + + with open(Path(self.results_dir.name) / "all_results.json") as fp: + results = json.load(fp) + self.assertEqual(len(results), 2) + + with open( + Path(self.results_dir.name) / "MockAsset 1" / "ar" / "0.json" + ) as fp: + cache_obj = json.load(fp) + self.assertEqual( + cache_obj["input"]["input_id"], "default_ar_test_data1" + ) + + with open( + Path(self.results_dir.name) / "MockAsset 1" / "en" / "0.json" + ) as fp: + cache_obj = json.load(fp) + self.assertEqual( + cache_obj["input"]["input_id"], "default_en_test_data1" + ) + + @patch("llmebench.benchmark.Benchmark.find_assets") + def test_asset_with_selected_split_multilevel(self, asset_finder_mock): + "Run benchmark with an asset (containing multi-level splits) and selected split" + asset_finder_mock.return_value = [ + { + "name": "MockAsset 1", + "config": { + **MockAsset.config(), + "dataset": MockDatasetWithMultiLevelSplits, + "general_args": {"test_split": ["ar/dev"]}, + }, + "module": MockAsset, + } + ] + + testargs = ["llmebench", self.benchmark_dir.name, self.results_dir.name] + with patch.object(sys, "argv", testargs): + llmebench.benchmark.main() + + with open(Path(self.results_dir.name) / "all_results.json") as fp: + results = json.load(fp) + self.assertEqual(len(results), 1) + + with open(Path(self.results_dir.name) / "MockAsset 1" / "0.json") as fp: + cache_obj = json.load(fp) + self.assertEqual(cache_obj["input"]["input_id"], "default_ar_dev_data1") + + @patch("llmebench.benchmark.Benchmark.find_assets") + def test_asset_with_selected_split_multilevel_shorthand(self, asset_finder_mock): + "Run benchmark with an asset (containing multi-level splits) and selected split (using shorthand)" + asset_finder_mock.return_value = [ + { + "name": "MockAsset 1", + "config": { + 
**MockAsset.config(), + "dataset": MockDatasetWithMultiLevelSplits, + "general_args": {"test_split": ["ar"]}, + }, + "module": MockAsset, + } + ] + + testargs = ["llmebench", self.benchmark_dir.name, self.results_dir.name] + with patch.object(sys, "argv", testargs): + llmebench.benchmark.main() + + with open(Path(self.results_dir.name) / "all_results.json") as fp: + results = json.load(fp) + self.assertEqual(len(results), 1) + + with open(Path(self.results_dir.name) / "MockAsset 1" / "0.json") as fp: + cache_obj = json.load(fp) + self.assertEqual( + cache_obj["input"]["input_id"], "default_ar_test_data1" + ) + + @patch("llmebench.benchmark.Benchmark.find_assets") + def test_asset_with_selected_splits_multilevel(self, asset_finder_mock): + "Run benchmark with an asset (containing multi-level splits) and selected splits" + asset_finder_mock.return_value = [ + { + "name": "MockAsset 1", + "config": { + **MockAsset.config(), + "dataset": MockDatasetWithMultiLevelSplits, + "general_args": {"test_split": ["ar/dev", "en/dev", "en/test"]}, + }, + "module": MockAsset, + } + ] + + testargs = ["llmebench", self.benchmark_dir.name, self.results_dir.name] + with patch.object(sys, "argv", testargs): + llmebench.benchmark.main() + + with open(Path(self.results_dir.name) / "all_results.json") as fp: + results = json.load(fp) + self.assertEqual(len(results), 3) + + with open( + Path(self.results_dir.name) / "MockAsset 1" / "ar" / "dev" / "0.json" + ) as fp: + cache_obj = json.load(fp) + self.assertEqual(cache_obj["input"]["input_id"], "default_ar_dev_data1") + + with open( + Path(self.results_dir.name) / "MockAsset 1" / "en" / "dev" / "0.json" + ) as fp: + cache_obj = json.load(fp) + self.assertEqual(cache_obj["input"]["input_id"], "default_en_dev_data1") + + with open( + Path(self.results_dir.name) / "MockAsset 1" / "en" / "test" / "0.json" + ) as fp: + cache_obj = json.load(fp) + self.assertEqual( + cache_obj["input"]["input_id"], "default_en_test_data1" + ) + + 
@patch("llmebench.benchmark.Benchmark.find_assets") + def test_fewshot_asset_with_default_splits(self, asset_finder_mock): + "Run benchmark with an asset and its default splits" + asset_finder_mock.return_value = [ + { + "name": "MockFewShotAsset 1", + "config": MockFewShotAsset.config(), + "module": MockFewShotAsset, + } + ] + + testargs = [ + "llmebench", + "--n_shots", + "3", + self.benchmark_dir.name, + self.results_dir.name, + ] + with patch.object(sys, "argv", testargs): + llmebench.benchmark.main() + + with open(Path(self.results_dir.name) / "all_results.json") as fp: + results = json.load(fp) + self.assertEqual(len(results), 1) + + print(list((Path(self.results_dir.name) / "MockFewShotAsset 1").iterdir())) + + with open( + Path(self.results_dir.name) / "MockFewShotAsset 1" / "3_shot" / "0.json" + ) as fp: + cache_obj = json.load(fp) + for fse in cache_obj["few_shot_examples"]: + self.assertIn("train", fse["input_id"]) diff --git a/tests/test_benchmark_assets.py b/tests/test_benchmark_assets.py index e13c63d0..349c51ca 100644 --- a/tests/test_benchmark_assets.py +++ b/tests/test_benchmark_assets.py @@ -23,15 +23,18 @@ def test_required_functions(self): def validate_single_config(self, config): self.assertIn("dataset", config) - self.assertIn("dataset_args", config) + + if "dataset_args" in config: + self.assertIsInstance(config["dataset_args"], dict) self.assertIn("task", config) - self.assertIn("task_args", config) + if "task_args" in config: + self.assertIsInstance(config["task_args"], dict) self.assertIn("model", config) - self.assertIn("model_args", config) - self.assertIn("general_args", config) + if "model_args" in config: + self.assertIsInstance(config["model_args"], dict) - if "fewshot" in config["general_args"]: - self.assertIn("train_data_path", config["general_args"]["fewshot"]) + if "general_args" in config: + self.assertIsInstance(config["general_args"], dict) def test_config_format(self): "Test if all configs are well defined"