From 97dae47e99a7b6af5b2b4040135da908949c697b Mon Sep 17 00:00:00 2001 From: Basel Mousi Date: Sun, 27 Aug 2023 14:30:49 +0300 Subject: [PATCH 1/7] add temp mt file --- .../MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py b/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py index 68db7bb7..a550b839 100644 --- a/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py +++ b/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py @@ -44,11 +44,11 @@ def config(): "madar.test.nil.0.sd", "madar.test.nil.1.eg", "madar.test.nil.2.eg", - "summa-2M.test.mgr.0.ma", - "summa-AJ.test.msa.0.ms", - "summa-BBC.test.msa.0.ms", - "summa-LBC.test.lev.0.lb", - "summa-Oman.test.glf.0.om", + # "summa-2M.test.mgr.0.ma", + # "summa-AJ.test.msa.0.ms", + # "summa-BBC.test.msa.0.ms", + # "summa-LBC.test.lev.0.lb", + # "summa-Oman.test.glf.0.om", ] configs = [] for testset in sets: From 2e11bca6d0252fe566b3fb953b489b1861e13e79 Mon Sep 17 00:00:00 2001 From: Basel Mousi Date: Sat, 9 Sep 2023 15:01:18 +0300 Subject: [PATCH 2/7] added data loader for the Shami Corpus --- llmebench/datasets/ShamiCorpus.py | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 llmebench/datasets/ShamiCorpus.py diff --git a/llmebench/datasets/ShamiCorpus.py b/llmebench/datasets/ShamiCorpus.py new file mode 100644 index 00000000..39907147 --- /dev/null +++ b/llmebench/datasets/ShamiCorpus.py @@ -0,0 +1,40 @@ +from llmebench.datasets.dataset_base import DatasetBase +from pathlib import Path + +class ShamiDataset(DatasetBase): + def __init__(self, **kwargs): + super(ShamiDataset, self).__init__(**kwargs) + def metadata(): + return { + "language":"ar", + "citation": """ @inproceedings{abu-kwaik-etal-2018-shami, + title = "{S}hami: A Corpus of {L}evantine {A}rabic Dialects", + author = "Abu Kwaik, Kathrein and + Saad, Motaz and + Chatzikyriakidis, Stergios and + Dobnik, Simon", + booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)", + month = may, + year = "2018", + address = "Miyazaki, Japan", + publisher = "European Language Resources Association (ELRA)", + url = "https://aclanthology.org/L18-1576", + } +""" + } + def get_data_sample(self): + return {"input": "a sentence", "label": "dialect of sentence"} + + def load_data(self, data_path, no_labels=False): + data = [] + filenames= ["Jordanian.txt", "Lebanese.txt", "Palestinian.txt", "Syrian.txt"] + for name in filenames: + path = Path(data_path) / name + with open(path, "r") as reader: + for line in reader: + sentence = line.strip() + label = name.split(".")[0] + data.append( + {"input": sentence, "label": label} + ) + return data From 80aaeb419b870c74faf13baea69d08d1d48c1503 Mon Sep 17 00:00:00 2001 From: Basel Mousi Date: Sat, 9 Sep 2023 15:41:15 +0300 Subject: [PATCH 3/7] exported the dataset in init file --- llmebench/datasets/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llmebench/datasets/__init__.py b/llmebench/datasets/__init__.py index 65cba2c7..2c47d72b 100644 --- a/llmebench/datasets/__init__.py +++ b/llmebench/datasets/__init__.py @@ -49,3 +49,4 @@ from .XGLUEPOS import XGLUEPOSDataset from .XNLI import XNLIDataset from .XQuAD import XQuADDataset +from .ShamiCorpus import ShamiDataset \ No newline at end of file From 645958ebc2fb0727ce05106704a00f7e95702759 Mon Sep 17 00:00:00 2001 From: Basel Mousi Date: Sat, 9 Sep 2023 15:42:07 +0300 Subject: [PATCH 4/7] added a chatgpt ZS asset for dialect identification on Shami Corpus --- .../ShamiCorpus_GPT35_ZeroShot.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py new file mode 100644 index 00000000..62852413 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py @@ -0,0 +1,54 @@ +import os +from llmebench.datasets import ShamiDataset +from llmebench.models import LegacyOpenAIModel +from llmebench.tasks import DialectIDTask + + + +def config(): + return { + "dataset": ShamiDataset, + "dataset_args": {}, + "task": DialectIDTask, + "task_args": {}, + "model": LegacyOpenAIModel, + "model_args": { + "api_type": "azure", + "api_version": "2023-03-15-preview", + "api_base": os.environ["AZURE_API_URL"], + "api_key": os.environ["AZURE_API_KEY"], + "engine_name": os.environ["ENGINE_NAME"], + "class_labels": [ + "Lebanese", + "Jordanian", + "Palestinian", + "Syrian" + + ], + "max_tries": 3, + }, + "general_args": { + "data_path": "data/dialect-data/shami-corpus" + }, + } + + +def prompt(input_sample): + prompt_string = ( + f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: " + ) + + return { + "system_message": "You are an AI assistant that helps people find information.", + "messages": [ + { + "sender": "user", + "text": prompt_string + } + ] + } + + +def post_process(response): + label = response["choices"][0]["text"] + return label \ No newline at end of file From a2ca6d553d8854215ee782d9e2a177866999c602 Mon Sep 17 00:00:00 2001 From: Basel Mousi Date: Sat, 9 Sep 2023 15:50:04 +0300 Subject: [PATCH 5/7] Added gpt4 zero-shot asset for dialect identification on Shami Corpus --- .../ShamiCorpus_GPT4_ZeroShot.py | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py new file mode 100644 index 00000000..8a7217c9 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py @@ -0,0 +1,55 @@ +import os +from llmebench.datasets import ShamiDataset +from llmebench.models import OpenAIModel +from llmebench.tasks import DialectIDTask + + + +def config(): + return { + "dataset": ShamiDataset, + "dataset_args": {}, + "task": DialectIDTask, + "task_args": {}, + "model": OpenAIModel, + "model_args": { + "api_type": "azure", + "api_version": "2023-03-15-preview", + "api_base": os.environ["AZURE_API_URL"], + "api_key": os.environ["AZURE_API_KEY"], + "engine_name": os.environ["ENGINE_NAME"], + "class_labels": [ + "Lebanese", + "Jordanian", + "Palestinian", + "Syrian" + + ], + "max_tries": 3, + }, + "general_args": { + "data_path": "data/dialect-data/shami-corpus" + }, + } + + +def prompt(input_sample): + prompt_string = ( + f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: " + ) + + return [ + { + "role": "system", + "content": "You are an AI assistant that helps people find information.", + }, + { + "role": "user", + "content": prompt_string, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + return label \ No newline at end of file From 2448d547f3dcaf597800cf3b6d5c0813bc777b24 Mon Sep 17 00:00:00 2001 From: Basel Mousi Date: Sat, 9 Sep 2023 16:14:43 +0300 Subject: [PATCH 6/7] Added zero shot asset for dialect identification - bloom --- .../ShamiCorpus_BLOOMZ_ZeroShot.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py new file mode 100644 index 00000000..5e765df4 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py @@ -0,0 +1,51 @@ +import os + +from llmebench.datasets import ShamiDataset +from llmebench.models import PetalsModel +from llmebench.tasks import DialectIDTask + + +def config(): + return { + "dataset": ShamiDataset, + "dataset_args": {}, + "task": DialectIDTask, + "task_args": {}, + "model": PetalsModel, + "model_args": { + "api_url": os.environ["API_URL"], + "class_labels": [ + + "Lebanese", + "Jordanian", + "Palestinian", + "Syrian" + ], + "max_tries": 22, + }, + "general_args": { + "data_path": "data/dialect-data/shami-corpus", + }, + } + + +def prompt(input_sample): + prompt_string = ( + f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: " + ) + + return { + "prompt": prompt_string, + } + + +def post_process(response): + label = response["outputs"].strip() + #label = label.replace("", "") + label = label.replace("", "") + # label = label.replace("Dialect: ", "").replace("dialect: ", "") + # label = label.replace("label: ", "") + # label = label.strip() + + + return label From f0fb456e3de531fe7ff1e1c15bb01ee1f840268f Mon Sep 17 00:00:00 2001 From: Basel Mousi Date: Sat, 9 Sep 2023 16:18:00 +0300 Subject: [PATCH 7/7] formatted code for shami corpus dialect identification --- .../ShamiCorpus_BLOOMZ_ZeroShot.py | 15 ++----- .../ShamiCorpus_GPT35_ZeroShot.py | 35 +++++---------- .../ShamiCorpus_GPT4_ZeroShot.py | 24 +++------- llmebench/datasets/ShamiCorpus.py | 44 ++++++++++--------- llmebench/datasets/__init__.py | 2 +- 5 files changed, 44 insertions(+), 76 deletions(-) diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py index 5e765df4..573ecf0f 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py @@ -14,13 +14,7 @@ def config(): "model": PetalsModel, "model_args": { "api_url": os.environ["API_URL"], - "class_labels": [ - - "Lebanese", - "Jordanian", - "Palestinian", - "Syrian" - ], + "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"], "max_tries": 22, }, "general_args": { @@ -30,9 +24,7 @@ def config(): def prompt(input_sample): - prompt_string = ( - f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: " - ) + prompt_string = f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: " return { "prompt": prompt_string, @@ -41,11 +33,10 @@ def prompt(input_sample): def post_process(response): label = response["outputs"].strip() - #label = label.replace("", "") + # label = label.replace("", "") label = label.replace("", "") # label = label.replace("Dialect: ", "").replace("dialect: ", "") # label = label.replace("label: ", "") # label = label.strip() - return label diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py index 62852413..610f3c6d 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py @@ -1,10 +1,10 @@ import os + from llmebench.datasets import ShamiDataset from llmebench.models import LegacyOpenAIModel from llmebench.tasks import DialectIDTask - def config(): return { "dataset": ShamiDataset, @@ -18,37 +18,22 @@ def config(): "api_base": os.environ["AZURE_API_URL"], "api_key": os.environ["AZURE_API_KEY"], "engine_name": os.environ["ENGINE_NAME"], - "class_labels": [ - "Lebanese", - "Jordanian", - "Palestinian", - "Syrian" - - ], + "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"], "max_tries": 3, }, - "general_args": { - "data_path": "data/dialect-data/shami-corpus" - }, + "general_args": {"data_path": "data/dialect-data/shami-corpus"}, } -def prompt(input_sample): - prompt_string = ( - f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: " - ) +def prompt(input_sample): + prompt_string = f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: " - return { - "system_message": "You are an AI assistant that helps people find information.", - "messages": [ - { - "sender": "user", - "text": prompt_string - } - ] + return { + "system_message": "You are an AI assistant that helps people find information.", + "messages": [{"sender": "user", "text": prompt_string}], } -def post_process(response): +def post_process(response): label = response["choices"][0]["text"] - return label \ No newline at end of file + return label diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py index 8a7217c9..f2480e82 100644 --- a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py @@ -1,10 +1,10 @@ import os + from llmebench.datasets import ShamiDataset from llmebench.models import OpenAIModel from llmebench.tasks import DialectIDTask - def config(): return { "dataset": ShamiDataset, @@ -18,25 +18,15 @@ def config(): "api_base": os.environ["AZURE_API_URL"], "api_key": os.environ["AZURE_API_KEY"], "engine_name": os.environ["ENGINE_NAME"], - "class_labels": [ - "Lebanese", - "Jordanian", - "Palestinian", - "Syrian" - - ], + "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"], "max_tries": 3, }, - "general_args": { - "data_path": "data/dialect-data/shami-corpus" - }, + "general_args": {"data_path": "data/dialect-data/shami-corpus"}, } -def prompt(input_sample): - prompt_string = ( - f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: " - ) +def prompt(input_sample): + prompt_string = f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: " return [ { @@ -50,6 +40,6 @@ def prompt(input_sample): ] -def post_process(response): +def post_process(response): label = response["choices"][0]["message"]["content"] - return label \ No newline at end of file + return label diff --git a/llmebench/datasets/ShamiCorpus.py b/llmebench/datasets/ShamiCorpus.py index 39907147..ca6503b9 100644 --- a/llmebench/datasets/ShamiCorpus.py +++ b/llmebench/datasets/ShamiCorpus.py @@ -1,12 +1,15 @@ -from llmebench.datasets.dataset_base import DatasetBase from pathlib import Path -class ShamiDataset(DatasetBase): - def __init__(self, **kwargs): - super(ShamiDataset, self).__init__(**kwargs) - def metadata(): - return { - "language":"ar", +from llmebench.datasets.dataset_base import DatasetBase + + +class ShamiDataset(DatasetBase): + def __init__(self, **kwargs): + super(ShamiDataset, self).__init__(**kwargs) + + def metadata(): + return { + "language": "ar", "citation": """ @inproceedings{abu-kwaik-etal-2018-shami, title = "{S}hami: A Corpus of {L}evantine {A}rabic Dialects", author = "Abu Kwaik, Kathrein and @@ -20,21 +23,20 @@ def metadata(): publisher = "European Language Resources Association (ELRA)", url = "https://aclanthology.org/L18-1576", } -""" +""", } - def get_data_sample(self): - return {"input": "a sentence", "label": "dialect of sentence"} - - def load_data(self, data_path, no_labels=False): + + def get_data_sample(self): + return {"input": "a sentence", "label": "dialect of sentence"} + + def load_data(self, data_path, no_labels=False): data = [] - filenames= ["Jordanian.txt", "Lebanese.txt", "Palestinian.txt", "Syrian.txt"] - for name in filenames: - path = Path(data_path) / name - with open(path, "r") as reader: - for line in reader: - sentence = line.strip() + filenames = ["Jordanian.txt", "Lebanese.txt", "Palestinian.txt", "Syrian.txt"] + for name in filenames: + path = Path(data_path) / name + with open(path, "r") as reader: + for line in reader: + sentence = line.strip() label = name.split(".")[0] - data.append( - {"input": sentence, "label": label} - ) + data.append({"input": sentence, "label": label}) return data diff --git a/llmebench/datasets/__init__.py b/llmebench/datasets/__init__.py index 2c47d72b..54c0b9a3 100644 --- a/llmebench/datasets/__init__.py +++ b/llmebench/datasets/__init__.py @@ -36,6 +36,7 @@ from .SemEval17T1STS import SemEval17T1STSDataset from .SemEval17T2STS import SemEval17T2STSDataset from .SemEval23T3Propaganda import SemEval23T3PropagandaDataset +from .ShamiCorpus import ShamiDataset from .Spam import SpamDataset from .STSQ2Q import Q2QSimDataset from .TyDiQA import TyDiQADataset @@ -49,4 +50,3 @@ from .XGLUEPOS import XGLUEPOSDataset from .XNLI import XNLIDataset from .XQuAD import XQuADDataset -from .ShamiCorpus import ShamiDataset \ No newline at end of file