From da354fa85ec59dec7a3d7566245d451077039702 Mon Sep 17 00:00:00 2001
From: Maram Hasanain
Date: Mon, 28 Aug 2023 14:52:25 +0300
Subject: [PATCH 1/2] Created DialectQADI_BLOOMZ_ZeroShot.py

---
Notes (free-form text in this section is ignored by `git am`):
  config()       - selects QADIDataset, DialectIDTask and BLOOMPetalModel,
                   and lists the 19 class labels; reads API_URL from the
                   environment (KeyError if it is not set).
  prompt()       - builds the zero-shot prompt: the list of 19 candidate
                   dialect names followed by the input text and an empty
                   "label:" slot.
  post_process() - strips "<s>" / "</s>" markers and any "Dialect: ",
                   "dialect: " or "label: " prefix from the model output,
                   then maps the remaining dialect name to its class label.
                   Returns None when the name is not in the mapping.

 .../DialectQADI_BLOOMZ_ZeroShot.py | 94 +++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectQADI_BLOOMZ_ZeroShot.py

diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectQADI_BLOOMZ_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectQADI_BLOOMZ_ZeroShot.py
new file mode 100644
index 00000000..9f30ab4d
--- /dev/null
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/DialectQADI_BLOOMZ_ZeroShot.py
@@ -0,0 +1,94 @@
+import os
+
+from llmebench.datasets import QADIDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import DialectIDTask
+
+def config():
+    return {
+        "dataset": QADIDataset,
+        "dataset_args": {},
+        "task": DialectIDTask,
+        "task_args": {},
+        "model": BLOOMPetalModel,
+        "model_args": {
+            "api_url": os.environ["API_URL"],
+            "class_labels": [
+                "EG",
+                "DZ",
+                "SD",
+                "YE",
+                "SY",
+                "TN",
+                "AE",
+                "JO",
+                "LY",
+                "PS",
+                "OM",
+                "LB",
+                "KW",
+                "QA",
+                "BH",
+                "MSA",
+                "SA",
+                "IQ",
+                "MA",
+            ],
+            "max_tries": 0,
+        },
+        "general_args": {
+            "data_path": "data/sequence_tagging_ner_pos_etc/dialect_identification/QADI_test-PalestinePS-corrected.txt",
+        },
+    }
+
+
+def prompt(input_sample):
+    prompt_string = (
+        f'Identify the dialect of the following Arabic "text" given the following possible list of dialects: “Egyptian”, “Algerian”, "Sudanese", "Yemeni", "Syrian", “Tunisian”, "Emirati", "Jordanian", "Libyan", "Palestinian", "Omani", “Lebanese”, “Kuwaiti”, "Qatari", "Bahrani", "modern standard Arabic", "Saudi", "Iraqi", "Moroccan"\n\n'
+        f"text: {input_sample}\n"
+        f"label: \n"
+    )
+
+    return {
+        "prompt": prompt_string,
+    }
+
+
+def post_process(response):
+    count_label_map = {
+        "Egyptian": "EG",
+        "Algerian": "DZ",
+        "Sudanese": "SD",
+        "Yemeni": "YE",
+        "Syrian": "SY",
+        "Tunisian": "TN",
+        "Emirati": "AE",
+        "Jordanian": "JO",
+        "Libyan": "LY",
+        "Palestinian": "PS",
+        "Omani": "OM",
+        "Lebanese": "LB",
+        "Kuwaiti": "KW",
+        "Qatari": "QA",
+        "Bahrani": "BH",
+        "modern standard Arabic": "MSA",
+        "Modern standard Arabic": "MSA",
+        "Modern Standard Arabic": "MSA",
+        "Saudi": "SA",
+        "Iraqi": "IQ",
+        "Moroccan": "MA",
+    }
+
+    label = response["outputs"].strip()
+    label = label.replace("<s>", "")
+    label = label.replace("</s>", "")
+    label = label.replace("Dialect: ", "").replace("dialect: ","")
+    label = label.replace("label: ", "")
+    label = label.strip()
+
+    if label in count_label_map:
+        label_fixed = count_label_map[label]
+    else:
+        label_fixed = None
+
+    return label_fixed

From 5306c0d8401f461dcacbd466361cd7bdb26acffc Mon Sep 17 00:00:00 2001
From: Fahim Imaduddin Dalvi
Date: Thu, 7 Sep 2023 13:13:15 +0300
Subject: [PATCH 2/2] Code cleanup

---
 ...lectQADI_BLOOMZ_ZeroShot.py => QADI_BLOOMZ_ZeroShot.py} | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
 rename assets/ar/sequence_tagging_and_information_extraction/dialect_identification/{DialectQADI_BLOOMZ_ZeroShot.py => QADI_BLOOMZ_ZeroShot.py} (94%)

diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/DialectQADI_BLOOMZ_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_BLOOMZ_ZeroShot.py
similarity index 94%
rename from assets/ar/sequence_tagging_and_information_extraction/dialect_identification/DialectQADI_BLOOMZ_ZeroShot.py
rename to assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_BLOOMZ_ZeroShot.py
index 9f30ab4d..42c82607 100644
--- a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/DialectQADI_BLOOMZ_ZeroShot.py
+++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_BLOOMZ_ZeroShot.py
@@ -1,16 +1,17 @@
 import os
 
 from llmebench.datasets import QADIDataset
-from llmebench.models import BLOOMPetalModel
+from llmebench.models import PetalsModel
 from llmebench.tasks import DialectIDTask
 
+
 def config():
     return {
         "dataset": QADIDataset,
         "dataset_args": {},
         "task": DialectIDTask,
         "task_args": {},
-        "model": BLOOMPetalModel,
+        "model": PetalsModel,
         "model_args": {
             "api_url": os.environ["API_URL"],
             "class_labels": [
@@ -82,7 +83,7 @@ def post_process(response):
     label = response["outputs"].strip()
     label = label.replace("<s>", "")
     label = label.replace("</s>", "")
-    label = label.replace("Dialect: ", "").replace("dialect: ","")
+    label = label.replace("Dialect: ", "").replace("dialect: ", "")
     label = label.replace("label: ", "")
     label = label.strip()
 