Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/dialect identification shami corpus #207

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import os

from llmebench.datasets import ShamiDataset
from llmebench.models import PetalsModel
from llmebench.tasks import DialectIDTask


def config():
    """Return the benchmark configuration for this asset.

    The result wires together the dataset class, the task class and the
    model class, along with the keyword arguments for each.

    Raises:
        KeyError: If the ``API_URL`` environment variable is not set.
    """
    dialect_labels = ["Lebanese", "Jordanian", "Palestinian", "Syrian"]

    model_settings = {
        "api_url": os.environ["API_URL"],
        "class_labels": dialect_labels,
        "max_tries": 22,
    }
    general_settings = {
        "data_path": "data/dialect-data/shami-corpus",
    }

    configuration = {
        "dataset": ShamiDataset,
        "dataset_args": {},
        "task": DialectIDTask,
        "task_args": {},
        "model": PetalsModel,
        "model_args": model_settings,
        "general_args": general_settings,
    }
    return configuration


def prompt(input_sample):
    """Build the dialect-identification prompt for one sample.

    Args:
        input_sample: The text to classify; it is interpolated into the
            prompt right after ``Input:``.

    Returns:
        A dict with a single ``"prompt"`` key holding the full prompt text.
    """
    # The instruction text is split over adjacent literals purely for
    # readability; the concatenated result is one continuous string.
    text = (
        "Task Description: You are an expert in identifying the dialect of a given arabic text. "
        "You will be given a text and you should output the dialect to which the text belongs.\n"
        "Note: Please make sure that the class that you output is one of the following: "
        "Lebanese, Jordanian, Palestinian, or Syrian.\n"
        " Output the class only without any illustrations\n"
        f"Input:{input_sample} \n"
        "Label: "
    )
    return {"prompt": text}


def post_process(response):
    """Extract the predicted dialect label from a model response.

    Args:
        response: Mapping whose ``"outputs"`` entry is the generated string.

    Returns:
        The generated string with any ``<s>`` / ``</s>`` markers removed and
        surrounding whitespace trimmed.
    """
    label = response["outputs"]
    for marker in ("<s>", "</s>"):
        label = label.replace(marker, "")
    # Strip only after the markers are gone: whitespace that sat between the
    # label and a marker (e.g. "Syrian </s>") would otherwise survive and
    # prevent an exact match against the bare class labels.
    return label.strip()
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import os

from llmebench.datasets import ShamiDataset
from llmebench.models import LegacyOpenAIModel
from llmebench.tasks import DialectIDTask


def config():
    """Return the benchmark configuration for this asset.

    The result wires together the dataset class, the task class and the
    model class, along with the keyword arguments for each.

    Raises:
        KeyError: If ``AZURE_API_URL``, ``AZURE_API_KEY`` or ``ENGINE_NAME``
            is missing from the environment.
    """
    model_settings = {
        "api_type": "azure",
        "api_version": "2023-03-15-preview",
        "api_base": os.environ["AZURE_API_URL"],
        "api_key": os.environ["AZURE_API_KEY"],
        "engine_name": os.environ["ENGINE_NAME"],
        "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"],
        "max_tries": 3,
    }
    general_settings = {"data_path": "data/dialect-data/shami-corpus"}

    configuration = {
        "dataset": ShamiDataset,
        "dataset_args": {},
        "task": DialectIDTask,
        "task_args": {},
        "model": LegacyOpenAIModel,
        "model_args": model_settings,
        "general_args": general_settings,
    }
    return configuration


def prompt(input_sample):
    """Build the dialect-identification request for one sample.

    Args:
        input_sample: The text to classify; it is interpolated into the
            user message right after ``Input:``.

    Returns:
        A dict with a ``"system_message"`` string and a ``"messages"`` list
        containing a single user message that carries the prompt text.
    """
    # The instruction text is split over adjacent literals purely for
    # readability; the concatenated result is one continuous string.
    text = (
        "Task Description: You are an expert in identifying the dialect of a given arabic text. "
        "You will be given a text and you should output the dialect to which the text belongs.\n"
        "Note: Please make sure that the class that you output is one of the following: "
        "Lebanese, Jordanian, Palestinian, or Syrian.\n"
        " Output the class only without any illustrations\n"
        f"Input:{input_sample} \n"
        "Label: "
    )
    user_message = {"sender": "user", "text": text}
    request = {
        "system_message": "You are an AI assistant that helps people find information.",
        "messages": [user_message],
    }
    return request


def post_process(response):
    """Extract the predicted dialect label from a completion response.

    Args:
        response: Mapping whose ``["choices"][0]["text"]`` entry is the
            generated string.

    Returns:
        The generated string with surrounding whitespace trimmed, so that a
        leading space or trailing newline does not prevent an exact match
        against the bare class labels.
    """
    return response["choices"][0]["text"].strip()
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import os

from llmebench.datasets import ShamiDataset
from llmebench.models import OpenAIModel
from llmebench.tasks import DialectIDTask


def config():
    """Return the benchmark configuration for this asset.

    The result wires together the dataset class, the task class and the
    model class, along with the keyword arguments for each.

    Raises:
        KeyError: If ``AZURE_API_URL``, ``AZURE_API_KEY`` or ``ENGINE_NAME``
            is missing from the environment.
    """
    model_settings = {
        "api_type": "azure",
        "api_version": "2023-03-15-preview",
        "api_base": os.environ["AZURE_API_URL"],
        "api_key": os.environ["AZURE_API_KEY"],
        "engine_name": os.environ["ENGINE_NAME"],
        "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"],
        "max_tries": 3,
    }
    general_settings = {"data_path": "data/dialect-data/shami-corpus"}

    configuration = {
        "dataset": ShamiDataset,
        "dataset_args": {},
        "task": DialectIDTask,
        "task_args": {},
        "model": OpenAIModel,
        "model_args": model_settings,
        "general_args": general_settings,
    }
    return configuration


def prompt(input_sample):
    """Build the chat messages for classifying one sample's dialect.

    Args:
        input_sample: The text to classify; it is interpolated into the
            user message right after ``Input:``.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: a system
        message followed by the user message carrying the prompt text.
    """
    # The instruction text is split over adjacent literals purely for
    # readability; the concatenated result is one continuous string.
    text = (
        "Task Description: You are an expert in identifying the dialect of a given arabic text. "
        "You will be given a text and you should output the dialect to which the text belongs.\n"
        "Note: Please make sure that the class that you output is one of the following: "
        "Lebanese, Jordanian, Palestinian, or Syrian.\n"
        " Output the class only without any illustrations\n"
        f"Input:{input_sample} \n"
        "Label: "
    )
    system_turn = {
        "role": "system",
        "content": "You are an AI assistant that helps people find information.",
    }
    user_turn = {"role": "user", "content": text}
    return [system_turn, user_turn]


def post_process(response):
    """Extract the predicted dialect label from a chat response.

    Args:
        response: Mapping whose ``["choices"][0]["message"]["content"]``
            entry is the generated string.

    Returns:
        The generated string with surrounding whitespace trimmed, so that a
        leading space or trailing newline does not prevent an exact match
        against the bare class labels.
    """
    return response["choices"][0]["message"]["content"].strip()
42 changes: 42 additions & 0 deletions llmebench/datasets/ShamiCorpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from pathlib import Path

from llmebench.datasets.dataset_base import DatasetBase


class ShamiDataset(DatasetBase):
    """Dataset loader for the Shami corpus of Levantine Arabic dialects.

    Each sample is one line of text paired with the dialect whose file it
    was read from.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to ``DatasetBase``."""
        super().__init__(**kwargs)

    # Declared static so the method works both as ``ShamiDataset.metadata()``
    # and on an instance; without the decorator an instance call would pass
    # ``self`` to a zero-argument function and raise ``TypeError``.
    @staticmethod
    def metadata():
        """Return the dataset's language code and BibTeX citation."""
        return {
            "language": "ar",
            "citation": """ @inproceedings{abu-kwaik-etal-2018-shami,
title = "{S}hami: A Corpus of {L}evantine {A}rabic Dialects",
author = "Abu Kwaik, Kathrein and
Saad, Motaz and
Chatzikyriakidis, Stergios and
Dobnik, Simon",
booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)",
month = may,
year = "2018",
address = "Miyazaki, Japan",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L18-1576",
}
""",
        }

    def get_data_sample(self):
        """Return a placeholder sample illustrating the keys of each item."""
        return {"input": "a sentence", "label": "dialect of sentence"}

    def load_data(self, data_path, no_labels=False):
        """Read every dialect file under ``data_path`` into a list of samples.

        Args:
            data_path: Directory containing ``Jordanian.txt``,
                ``Lebanese.txt``, ``Palestinian.txt`` and ``Syrian.txt``.
            no_labels: Accepted for interface compatibility; not consulted
                by this loader.

        Returns:
            A list of ``{"input": sentence, "label": dialect}`` dicts, in
            file order (Jordanian, Lebanese, Palestinian, Syrian) and line
            order within each file. Blank lines are skipped.

        Raises:
            OSError: If one of the dialect files cannot be opened.
        """
        dialects = ["Jordanian", "Lebanese", "Palestinian", "Syrian"]
        data = []
        for dialect in dialects:
            path = Path(data_path) / f"{dialect}.txt"
            # Explicit UTF-8: the corpus is Arabic text, and the platform's
            # default encoding is not guaranteed to be able to decode it.
            with open(path, "r", encoding="utf-8") as reader:
                for line in reader:
                    sentence = line.strip()
                    if not sentence:
                        # An empty line carries no text to classify.
                        continue
                    data.append({"input": sentence, "label": dialect})
        return data
1 change: 1 addition & 0 deletions llmebench/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from .SemEval17T1STS import SemEval17T1STSDataset
from .SemEval17T2STS import SemEval17T2STSDataset
from .SemEval23T3Propaganda import SemEval23T3PropagandaDataset
from .ShamiCorpus import ShamiDataset
from .Spam import SpamDataset
from .STSQ2Q import Q2QSimDataset
from .TyDiQA import TyDiQADataset
Expand Down
Loading