diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py
new file mode 100644
index 00000000..573ecf0f
--- /dev/null
+++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py
@@ -0,0 +1,42 @@
+import os
+
+from llmebench.datasets import ShamiDataset
+from llmebench.models import PetalsModel
+from llmebench.tasks import DialectIDTask
+
+
+def config():
+    return {
+        "dataset": ShamiDataset,
+        "dataset_args": {},
+        "task": DialectIDTask,
+        "task_args": {},
+        "model": PetalsModel,
+        "model_args": {
+            "api_url": os.environ["API_URL"],
+            "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"],
+            "max_tries": 22,
+        },
+        "general_args": {
+            "data_path": "data/dialect-data/shami-corpus",
+        },
+    }
+
+
+def prompt(input_sample):
+    prompt_string = f"Task Description: You are an expert in identifying the dialect of a given arabic text. \nYou will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: "
+
+    return {
+        "prompt": prompt_string,
+    }
+
+
+def post_process(response):
+    label = response["outputs"].strip()
+    # label = label.replace("<s>", "")
+    label = label.replace("</s>", "")
+    # label = label.replace("Dialect: ", "").replace("dialect: ", "")
+    # label = label.replace("label: ", "")
+    # label = label.strip()
+
+    return label
diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py
new file mode 100644
index 00000000..610f3c6d
--- /dev/null
+++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py
@@ -0,0 +1,39 @@
+import os
+
+from llmebench.datasets import ShamiDataset
+from llmebench.models import LegacyOpenAIModel
+from llmebench.tasks import DialectIDTask
+
+
+def config():
+    return {
+        "dataset": ShamiDataset,
+        "dataset_args": {},
+        "task": DialectIDTask,
+        "task_args": {},
+        "model": LegacyOpenAIModel,
+        "model_args": {
+            "api_type": "azure",
+            "api_version": "2023-03-15-preview",
+            "api_base": os.environ["AZURE_API_URL"],
+            "api_key": os.environ["AZURE_API_KEY"],
+            "engine_name": os.environ["ENGINE_NAME"],
+            "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"],
+            "max_tries": 3,
+        },
+        "general_args": {"data_path": "data/dialect-data/shami-corpus"},
+    }
+
+
+def prompt(input_sample):
+    prompt_string = f"Task Description: You are an expert in identifying the dialect of a given arabic text. \nYou will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: "
+
+    return {
+        "system_message": "You are an AI assistant that helps people find information.",
+        "messages": [{"sender": "user", "text": prompt_string}],
+    }
+
+
+def post_process(response):
+    label = response["choices"][0]["text"]
+    return label
diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py
new file mode 100644
index 00000000..f2480e82
--- /dev/null
+++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py
@@ -0,0 +1,45 @@
+import os
+
+from llmebench.datasets import ShamiDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import DialectIDTask
+
+
+def config():
+    return {
+        "dataset": ShamiDataset,
+        "dataset_args": {},
+        "task": DialectIDTask,
+        "task_args": {},
+        "model": OpenAIModel,
+        "model_args": {
+            "api_type": "azure",
+            "api_version": "2023-03-15-preview",
+            "api_base": os.environ["AZURE_API_URL"],
+            "api_key": os.environ["AZURE_API_KEY"],
+            "engine_name": os.environ["ENGINE_NAME"],
+            "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"],
+            "max_tries": 3,
+        },
+        "general_args": {"data_path": "data/dialect-data/shami-corpus"},
+    }
+
+
+def prompt(input_sample):
+    prompt_string = f"Task Description: You are an expert in identifying the dialect of a given arabic text. \nYou will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: "
+
+    return [
+        {
+            "role": "system",
+            "content": "You are an AI assistant that helps people find information.",
+        },
+        {
+            "role": "user",
+            "content": prompt_string,
+        },
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"]
+    return label
diff --git a/llmebench/datasets/ShamiCorpus.py b/llmebench/datasets/ShamiCorpus.py
new file mode 100644
index 00000000..ca6503b9
--- /dev/null
+++ b/llmebench/datasets/ShamiCorpus.py
@@ -0,0 +1,45 @@
+from pathlib import Path
+
+from llmebench.datasets.dataset_base import DatasetBase
+
+
+class ShamiDataset(DatasetBase):
+    def __init__(self, **kwargs):
+        super(ShamiDataset, self).__init__(**kwargs)
+
+    @staticmethod
+    def metadata():
+        return {
+            "language": "ar",
+            "citation": """ @inproceedings{abu-kwaik-etal-2018-shami,
+    title = "{S}hami: A Corpus of {L}evantine {A}rabic Dialects",
+    author = "Abu Kwaik, Kathrein and
+      Saad, Motaz and
+      Chatzikyriakidis, Stergios and
+      Dobnik, Simon",
+    booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)",
+    month = may,
+    year = "2018",
+    address = "Miyazaki, Japan",
+    publisher = "European Language Resources Association (ELRA)",
+    url = "https://aclanthology.org/L18-1576",
+}
+""",
+        }
+
+    def get_data_sample(self):
+        return {"input": "a sentence", "label": "dialect of sentence"}
+
+    def load_data(self, data_path, no_labels=False):
+        # Each file holds one sentence per line; the file name is the dialect label.
+        data = []
+        filenames = ["Jordanian.txt", "Lebanese.txt", "Palestinian.txt", "Syrian.txt"]
+        for name in filenames:
+            path = Path(data_path) / name
+            # Arabic text: decode explicitly as UTF-8 instead of the locale default.
+            with open(path, "r", encoding="utf-8") as reader:
+                for line in reader:
+                    sentence = line.strip()
+                    label = name.split(".")[0]
+                    data.append({"input": sentence, "label": label})
+        return data
diff --git a/llmebench/datasets/__init__.py b/llmebench/datasets/__init__.py
index 65cba2c7..54c0b9a3 100644
--- a/llmebench/datasets/__init__.py
+++ b/llmebench/datasets/__init__.py
@@ -36,6 +36,7 @@
 from .SemEval17T1STS import SemEval17T1STSDataset
 from .SemEval17T2STS import SemEval17T2STSDataset
 from .SemEval23T3Propaganda import SemEval23T3PropagandaDataset
+from .ShamiCorpus import ShamiDataset
 from .Spam import SpamDataset
 from .STSQ2Q import Q2QSimDataset
 from .TyDiQA import TyDiQADataset