diff --git a/assets/ar/QA/ARCD_JAIS13b_ZeroShot.py b/assets/ar/QA/ARCD_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..aeccf7d1 --- /dev/null +++ b/assets/ar/QA/ARCD_JAIS13b_ZeroShot.py @@ -0,0 +1,38 @@ +from llmebench.datasets import ARCDDataset +from llmebench.models import FastChatModel +from llmebench.tasks import QATask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + "scores": {"F1": "0.546"}, + } + + +def config(): + return { + "dataset": ARCDDataset, + "task": QATask, + "model": FastChatModel, + "model_args": { + "max_tries": 3, + }, + } + + +def prompt(input_sample): + base_prompt = f"Your task is to answer questions in Arabic based on a given context.\nNote: Your answers should be spans extracted from the given context without any illustrations.\nYou don't need to provide a complete answer\nContext:{input_sample['context']}\nQuestion:{input_sample['question']}\nAnswer:" + + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + return response["choices"][0]["message"]["content"] diff --git a/assets/ar/QA/MLQA_JAIS13b_ZeroShot.py b/assets/ar/QA/MLQA_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..6868a88c --- /dev/null +++ b/assets/ar/QA/MLQA_JAIS13b_ZeroShot.py @@ -0,0 +1,38 @@ +from llmebench.datasets import MLQADataset +from llmebench.models import FastChatModel +from llmebench.tasks import QATask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + "scores": {"F1": "0.540"}, + } + + +def config(): + return { + "dataset": MLQADataset, + "task": QATask, + "model": FastChatModel, + "model_args": { + "max_tries": 3, + }, + } + + +def prompt(input_sample): + base_prompt = f"Your task is to answer questions in Arabic based on a given context.\nNote: Your answers should be spans extracted from the given context without any illustrations.\nYou don't need to provide a complete answer\nContext:{input_sample['context']}\nQuestion:{input_sample['question']}\nAnswer:" + + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + return response["choices"][0]["message"]["content"] diff --git a/assets/ar/QA/TyDiQA_JAIS13b_ZeroShot.py b/assets/ar/QA/TyDiQA_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..7bad8cce --- /dev/null +++ b/assets/ar/QA/TyDiQA_JAIS13b_ZeroShot.py @@ -0,0 +1,39 @@ +from llmebench.datasets import TyDiQADataset +from llmebench.models import FastChatModel +from llmebench.tasks import QATask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + "scores": {"F1": "0.724"}, + } + + +def config(): + return { + "dataset": TyDiQADataset, + "task": QATask, + "model": FastChatModel, + "model_args": { + "max_tries": 3, + }, + "general_args": {"test_split": "dev"}, + } + + +def prompt(input_sample): + base_prompt = f"Your task is to answer questions in Arabic based on a given context.\nNote: Your answers should be spans extracted from the given context without any illustrations.\nYou don't need to provide a complete answer\nContext:{input_sample['context']}\nQuestion:{input_sample['question']}\nAnswer:" + + return [ + { + "role": "user", + "content": base_prompt, + 
}, + ] + + +def post_process(response): + return response["choices"][0]["message"]["content"] diff --git a/assets/ar/QA/XQuAD_JAIS13b_ZeroShot.py b/assets/ar/QA/XQuAD_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..16ba431a --- /dev/null +++ b/assets/ar/QA/XQuAD_JAIS13b_ZeroShot.py @@ -0,0 +1,38 @@ +from llmebench.datasets import XQuADDataset +from llmebench.models import FastChatModel +from llmebench.tasks import QATask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + "scores": {"F1": "0.636"}, + } + + +def config(): + return { + "dataset": XQuADDataset, + "task": QATask, + "model": FastChatModel, + "model_args": { + "max_tries": 3, + }, + } + + +def prompt(input_sample): + base_prompt = f"Your task is to answer questions in Arabic based on a given context.\nNote: Your answers should be spans extracted from the given context without any illustrations.\nYou don't need to provide a complete answer\nContext:{input_sample['context']}\nQuestion:{input_sample['question']}\nAnswer:" + + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + return response["choices"][0]["message"]["content"] diff --git a/assets/ar/demographic_attributes/gender/ArabGend_JAIS13b_ZeroShot.py b/assets/ar/demographic_attributes/gender/ArabGend_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..30e92e1d --- /dev/null +++ b/assets/ar/demographic_attributes/gender/ArabGend_JAIS13b_ZeroShot.py @@ -0,0 +1,47 @@ +from llmebench.datasets import ArabGendDataset +from llmebench.models import FastChatModel +from llmebench.tasks import ClassificationTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": ArabGendDataset, + "task": ClassificationTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["m", "f"], + "max_tries": 3, + }, + } + + +def prompt(input_sample): + base_prompt = ( + f"Identify the gender from the following name as 'female' or 'male'.\n\n" + f"name: {input_sample}" + f"gender: \n" + ) + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + if label.lower() == "male": + return "m" + elif "female" in label.lower(): + return "f" + else: + return None diff --git a/assets/ar/demographic_attributes/gender/ArapTweet_JAIS13b_ZeroShot.py b/assets/ar/demographic_attributes/gender/ArapTweet_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..b72e0add --- /dev/null +++ b/assets/ar/demographic_attributes/gender/ArapTweet_JAIS13b_ZeroShot.py @@ -0,0 +1,57 @@ +from llmebench.datasets import ArapTweetDataset +from llmebench.models import FastChatModel +from llmebench.tasks import ClassificationTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": ArapTweetDataset, + "task": ClassificationTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["Female", "Male"], + "max_tries": 30, + }, + } + + +def prompt(input_sample): + base_prompt = ( + f"Identify the gender from the following name as 'Female' or 'Male'.\n\n" + f"name: 
{input_sample}" + f"gender: \n" + ) + + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + label = label.strip() + if "gender: Female" in label or "\nFemale" in label or label == "Female": + label = "Female" + elif ( + "gender: Male" in label + or "\nMale" in label + or "likely to be 'Male'" in label + or label == "Male" + or "typically a 'Male' name" in label + ): + label = "Male" + else: + label = None + + return label diff --git a/assets/ar/demographic_attributes/location/Location_JAIS13b_ZeroShot.py b/assets/ar/demographic_attributes/location/Location_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..915cb63d --- /dev/null +++ b/assets/ar/demographic_attributes/location/Location_JAIS13b_ZeroShot.py @@ -0,0 +1,80 @@ +from llmebench.datasets import LocationDataset +from llmebench.models import FastChatModel +from llmebench.tasks import DemographyLocationTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": LocationDataset, + "task": DemographyLocationTask, + "model": FastChatModel, + "model_args": { + "class_labels": [ + "ae", + "OTHERS", + "bh", + "dz", + "eg", + "iq", + "jo", + "kw", + "lb", + "ly", + "ma", + "om", + "ps", + "qa", + "sa", + "sd", + "so", + "sy", + "tn", + "UNK", + "ye", + "mr", + ], + "max_tries": 30, + }, + } + + +def prompt(input_sample): + base_prompt = ( + f"Given the following 'user location', identify and map it to its corresponding country code in accordance with ISO 3166-1 alpha-2. " + f"Please write the country code only, with no additional explanations. " + f"If the country is not an Arab country, please write 'OTHERS'. 
If the location doesn't map to a recognized country, write 'UNK'.\n\n" + f"user location: {input_sample}\n" + f"country code: \n" + ) + + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"].lower() + + label_list = config()["model_args"]["class_labels"] + + if "country code: " in label: + label_fixed = label.replace("country code: ", "") + elif label.lower() == "uae": + label_fixed = "ae" + elif label in label_list: + label_fixed = label + else: + label_fixed = None + + return label_fixed diff --git a/assets/ar/demographic_attributes/name_info/NameInfo_JAIS13b_ZeroShot.py b/assets/ar/demographic_attributes/name_info/NameInfo_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..1aafdab4 --- /dev/null +++ b/assets/ar/demographic_attributes/name_info/NameInfo_JAIS13b_ZeroShot.py @@ -0,0 +1,160 @@ +from llmebench.datasets import NameInfoDataset +from llmebench.models import FastChatModel +from llmebench.tasks import DemographyNameInfoTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": NameInfoDataset, + "task": DemographyNameInfoTask, + "model": FastChatModel, + "model_args": { + "class_labels": [ + "gb", + "us", + "cl", + "fr", + "ru", + "pl", + "in", + "it", + "kr", + "gh", + "ca", + "sa", + "at", + "de", + "cn", + "br", + "dk", + "se", + "bd", + "cu", + "jp", + "be", + "es", + "co", + "id", + "iq", + "pk", + "tr", + "il", + "ch", + "ar", + "ro", + "nl", + "ps", + "ug", + "ir", + "cg", + "do", + "ee", + "tn", + "gr", + "np", + "ie", + "sy", + "hu", + "eg", + "ma", + "ve", + "ph", + "no", + "bg", + "si", + "ke", + "au", + "et", + "py", + "af", + "pt", + "th", + "bo", + "mx", + "lb", + "za", + "fi", + "hr", + "vn", + "ly", + "nz", + "qa", + "kh", + "ci", + "ng", + "sg", + "cm", + "dz", + "tz", + "ae", + "pe", + "az", + "lu", + "ec", + "cz", + "ua", + "uy", + "sd", + "ao", + "my", + "lv", + "kw", + "tw", + "bh", + "lk", + "ye", + "cr", + "jo", + "pa", + "om", + "uz", + "by", + "kz", + ], + "max_tries": 30, + }, + } + + +def prompt(input_sample): + base_prompt = ( + f"Label the country of the following person 'name'. 
Write ONLY the country code in ISO 3166-1 alpha-2 format.\n\n"
+        f"name: {input_sample}\n"
+        f"country: \n"
+    )
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"]
+
+    label_list = config()["model_args"]["class_labels"]
+
+    # The model is prompted to complete the "country: " field
+    if "country: " in label:
+        label_fixed = label.split("country: ")[1].strip().lower()
+    elif label.lower() == "uae":
+        label_fixed = "ae"
+    elif label.lower() in label_list:
+        label_fixed = label.lower()
+    else:
+        # Refusals such as "I cannot predict the country" and any other
+        # unrecognized output map to None
+        label_fixed = None
+
+    return label_fixed
diff --git a/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_JAIS13b_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..faf29cdf
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_JAIS13b_ZeroShot.py
@@ -0,0 +1,45 @@
+from llmebench.datasets import AdultDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import AdultTask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+    }
+
+
+def config():
+    return {
+        "dataset": AdultDataset,
+        "task": AdultTask,
+        "model": FastChatModel,
+        "model_args": {
+            "class_labels": ["ADULT", "NOT_ADULT"],
+            "max_tries": 3,
+        },
+    }
+
+
+def prompt(input_sample):
+    base_prompt = (
+        f'Given the following tweet, label it as "ADULT" or "NOT_ADULT" based on the content of the tweet.\n\n'
+        f"tweet: {input_sample}\n"
+        f"label: \n"
+    )
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    out = response["choices"][0]["message"]["content"].replace("label: ", "")
+    j = out.find(".")
+    if j > 0:
+        out = out[0:j]
+    return out
diff --git a/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_JAIS13b_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..6266751d
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_JAIS13b_ZeroShot.py
@@ -0,0 +1,72 @@
+from llmebench.datasets import CT22AttentionworthyDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import AttentionworthyTask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+    }
+
+
+def config():
+    return {
+        "dataset": CT22AttentionworthyDataset,
+        "task": AttentionworthyTask,
+        "model": FastChatModel,
+        "model_args": {
+            "class_labels": [
+                "yes_discusses_action_taken",
+                "harmful",
+                "yes_discusses_cure",
+                "yes_asks_question",
+                "no_not_interesting",
+                "yes_other",
+                "yes_blame_authorities",
+                "yes_contains_advice",
+                "yes_calls_for_action",
+            ],
+            "max_tries": 30,
+        },
+        "general_args": {"test_split": "ar"},
+    }
+
+
+def prompt(input_sample):
+    base_prompt = (
+        f'Annotate "tweet" into one of the following categories: yes_discusses_action_taken, harmful, yes_discusses_cure, yes_asks_question, no_not_interesting, yes_other, yes_blame_authorities, yes_contains_advice, 
yes_calls_for_action\n\n'
+        f"tweet: {input_sample}\n"
+        f"label: \n"
+    )
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"]
+
+    label = (
+        label.lower()
+        .replace(" - ", ", ")
+        .replace(",", "")
+        .replace(".", "")
+        .replace("label:", "")
+    )
+    label = label.strip()
+    # label = re.sub("\s+", "_", label)
+    if label.startswith("no"):
+        label_fixed = "no_not_interesting"
+    elif label == "yes_discusses_covid-19_vaccine_side_effects":
+        label_fixed = "yes_discusses_cure"
+    elif label == "yes_harmful":
+        label_fixed = "harmful"
+    elif label.startswith("yes"):
+        label_fixed = label
+    else:
+        # Fall back to None so label_fixed is never left unbound
+        label_fixed = None
+
+    return label_fixed
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..54e10e93
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot.py
@@ -0,0 +1,61 @@
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+    }
+
+
+def config():
+    return {
+        "dataset": CT22CheckworthinessDataset,
+        "task": CheckworthinessTask,
+        "model": FastChatModel,
+        "model_args": {
+            "class_labels": ["0", "1"],
+            "max_tries": 30,
+        },
+        "general_args": {"test_split": "ar"},
+    }
+
+
+def prompt(input_sample):
+    base_prompt = (
+        f'Annotate the "tweet" into "one" of the following categories: checkworthy or not_checkworthy\n\n'
+        f"tweet: {input_sample}\n"
+        f"label: \n"
+    )
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"]
+
+    label = label.replace("label:", "").strip()
+
+    if "label: " in label:
+        arr = label.split("label: ")
+        label = arr[1].strip()
+
+    if label == "checkworthy" or label == "Checkworthy":
+        label_fixed = "1"
+    elif label == "Not_checkworthy." or label == "not_checkworthy":
+        label_fixed = "0"
+    elif "not_checkworthy" in label or "label: not_checkworthy" in label:
+        label_fixed = "0"
+    elif "checkworthy" in label or "label: checkworthy" in label:
+        label_fixed = "1"
+    else:
+        label_fixed = None
+
+    return label_fixed
diff --git a/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_JAIS13b_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..76283eea
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_JAIS13b_ZeroShot.py
@@ -0,0 +1,55 @@
+from llmebench.datasets import CT22ClaimDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import ClaimDetectionTask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+    }
+
+
+def config():
+    return {
+        "dataset": CT22ClaimDataset,
+        "task": ClaimDetectionTask,
+        "model": FastChatModel,
+        "model_args": {
+            "class_labels": ["0", "1"],
+            "max_tries": 30,
+        },
+        "general_args": {"test_split": "ar"},
+    }
+
+
+def prompt(input_sample):
+    base_prompt = (
+        f"Given the following tweet, please identify if it contains a claim. If it does, annotate 'yes', if it does not, annotate 'no'\n\n"
+        f"tweet: {input_sample}\n"
+        f"label: \n"
+    )
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"]
+
+    label = label.replace("label:", "").strip()
+
+    if "label: " in label:
+        arr = label.split("label: ")
+        label = arr[1].strip()
+
+    if label == "yes" or label == "the sentence contains a factual claim":
+        label_fixed = "1"
+    elif label == "no":
+        label_fixed = "0"
+    else:
+        # Guard against unexpected answers so label_fixed is always defined
+        label_fixed = None
+
+    return label_fixed
diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..d2a4bd7f
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_JAIS13b_ZeroShot.py
@@ -0,0 +1,62 @@
+from llmebench.datasets import ANSFactualityDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import FactualityTask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+    }
+
+
+def config():
+    return {
+        "dataset": ANSFactualityDataset,
+        "task": FactualityTask,
+        "model": FastChatModel,
+        "model_args": {
+            "max_tries": 3,
+        },
+    }
+
+
+def prompt(input_sample):
+    base_prompt = (
+        "Detect whether the information in the sentence is factually true or false. "
+        "Answer only by true or false.\n\n"
+        + "Sentence: "
+        + input_sample
+        + "\nlabel: \n"
+    )
+
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    input_label = response["choices"][0]["message"]["content"]
+    input_label = input_label.replace(".", "").strip().lower()
+
+    if (
+        "true" in input_label
+        or "label: 1" in input_label
+        or "label: yes" in input_label
+    ):
+        pred_label = "true"
+    elif (
+        "false" in input_label
+        or "label: 0" in input_label
+        or "label: no" in input_label
+    ):
+        pred_label = "false"
+    else:
+        print("label problem!! 
" + input_label) + pred_label = None + + return pred_label diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_BLOOMZ_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_BLOOMZ_ZeroShot.py index 0fd76530..a21c5d98 100644 --- a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_BLOOMZ_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_BLOOMZ_ZeroShot.py @@ -8,6 +8,7 @@ def metadata(): "author": "Arabic Language Technologies, QCRI, HBKU", "model": "bloomz-176b (8bit quantized)", "description": "Locally hosted BLOOMZ 176b model (8 bit quantized version) using the Petals.", + "scores": {"Weighted-F1": "0.749"}, } @@ -29,8 +30,7 @@ def prompt(input_sample): input_sample = arr[:1000] prompt_string = ( - f"Classify following the tweet as yes or no.\n" - f"Provide only label.\n\n" + f"Does the following tweet contain a factually correct claim or not? Answer only by yes or no.\n\n" f"tweet: {input_sample}\n" f"label: \n" ) @@ -46,13 +46,26 @@ def post_process(response): label = label.replace("", "") label = label.lower() - if label.startswith("I am unable to verify".lower()) or label.startswith( - "I am unable to categorize".lower() + if ( + label.startswith("i am unable to verify") + or label.startswith("i am unable to categorize") + or label.startswith("i cannot") + or "cannot" in label ): label_fixed = None - elif "label: incorrect" in label or "incorrect" in label or label == "no": + elif ( + "label: incorrect" in label + or "incorrect" in label + or label == "no" + or label == "لا" + ): label_fixed = "no" - elif "label: correct" in label or "correct" in label or label == "yes": + elif ( + "label: correct" in label + or "correct" in label + or "yes" in label + or "نعم" in label + ): label_fixed = "yes" else: label_fixed = None diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT35_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT35_ZeroShot.py index 3f3a66b3..9fe2da47 100644 --- a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT35_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT35_ZeroShot.py @@ -8,7 +8,7 @@ def metadata(): "author": "Arabic Language Technologies, QCRI, HBKU", "model": "gpt-35-turbo (version 0301)", "description": "GPT35 model hosted on Azure, using the Completion API. API version '2023-03-15-preview'.", - "scores": {"Weighted-F1": "0.103"}, + "scores": {"Weighted-F1": "0.393"}, } @@ -26,8 +26,8 @@ def config(): def prompt(input_sample): prompt_string = ( - f"Detect the information in the sentence as correct or incorrect. Use label as yes or no.\n\n" - f"text: {input_sample}\n" + f"Does the following tweet contain a factually correct claim or not? 
Answer only by yes or no.\n\n" + f"tweet: {input_sample}\n" f"label: \n" ) return { @@ -44,16 +44,27 @@ def prompt(input_sample): def post_process(response): label = response["choices"][0]["text"].lower().replace(".", "").lower() - if label.startswith("I am unable to verify".lower()) or label.startswith( - "I am unable to categorize".lower() + if ( + label.startswith("i am unable to verify") + or label.startswith("i am unable to categorize") + or label.startswith("i cannot") + or "cannot" in label ): label_fixed = None - elif "incorrect" in label or "label: no" in label: + elif ( + "label: incorrect" in label + or "incorrect" in label + or label == "no" + or label == "لا" + ): label_fixed = "no" - elif "correct" in label or "label: yes" in label: + elif ( + "label: correct" in label + or "correct" in label + or "yes" in label + or "نعم" in label + ): label_fixed = "yes" - elif "no" == label or "yes" == label: - label_fixed = label else: label_fixed = None diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_FewShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_FewShot.py index a294c5d0..6510c4e0 100644 --- a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_FewShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_FewShot.py @@ -8,7 +8,7 @@ def metadata(): "author": "Arabic Language Technologies, QCRI, HBKU", "model": "gpt-4-32k (version 0314)", "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'. 3 samples where chosen per test sample based on MaxMarginalRelevance for few shot learning.", - "scores": {"Weighted-F1": "0.497"}, + "scores": {"Weighted-F1": "0.491"}, } @@ -49,11 +49,11 @@ def few_shot_prompt(input_sample, base_prompt, examples): def prompt(input_sample, examples): - base_prompt = f'Annotate the "tweet" into one of the following categories: yes or no. Provide only label.' + base_prompt = f"Does the following tweet contain a factually correct claim or not? Answer only by yes or no." 
return [ { "role": "system", - "content": "You are a social media expert, a fact-checker and you can annotate tweets.", + "content": "You are an expert fact-checker.", }, { "role": "user", @@ -63,20 +63,28 @@ def prompt(input_sample, examples): def post_process(response): - label = response["choices"][0]["message"]["content"] + label = response["choices"][0]["message"]["content"].lower() if ( + label.startswith("i am unable to verify") + or label.startswith("i am unable to categorize") + or label.startswith("i cannot") + or "cannot" in label + ): + # print(label) + label_fixed = None + elif ( "label: incorrect" in label or "incorrect" in label or label == "no" - or "label: no" in label + or label == "لا" ): label_fixed = "no" elif ( "label: correct" in label or "correct" in label - or label == "yes" - or "label: yes" in label + or "yes" in label + or "نعم" in label ): label_fixed = "yes" else: diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_ZeroShot.py index ea0c0689..f90cb347 100644 --- a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_GPT4_ZeroShot.py @@ -8,7 +8,7 @@ def metadata(): "author": "Arabic Language Technologies, QCRI, HBKU", "model": "gpt-4-32k (version 0314)", "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'.", - "scores": {"Weighted-F1": "0.372"}, + "scores": {"Weighted-F1": "0.485"}, } @@ -19,21 +19,21 @@ def config(): "model": OpenAIModel, "model_args": { "class_labels": ["yes", "no"], - "max_tries": 30, + "max_tries": 3, }, } def prompt(input_sample): prompt_string = ( - f'Annotate the "tweet" into one of the following categories: correct or incorrect\n\n' + f"Does the following tweet contain a factually correct claim or not? 
Answer only by yes or no.\n\n" f"tweet: {input_sample}\n" f"label: \n" ) return [ { "role": "system", - "content": "You are a social media expert, a fact-checker and you can annotate tweets.", # You are capable of identifying and annotating tweets correct or incorrect + "content": "You are an expert fact-checker.", # You are capable of identifying and annotating tweets correct or incorrect }, { "role": "user", @@ -43,15 +43,29 @@ def prompt(input_sample): def post_process(response): - label = response["choices"][0]["message"]["content"] + label = response["choices"][0]["message"]["content"].lower() - if label.startswith("I am unable to verify".lower()) or label.startswith( - "I am unable to categorize".lower() + if ( + label.startswith("i am unable to verify") + or label.startswith("i am unable to categorize") + or label.startswith("i cannot") + or "cannot" in label ): + # print(label) label_fixed = None - elif "label: incorrect" in label or "incorrect" in label: + elif ( + "label: incorrect" in label + or "incorrect" in label + or label == "no" + or label == "لا" + ): label_fixed = "no" - elif "label: correct" in label or "correct" in label: + elif ( + "label: correct" in label + or "correct" in label + or "yes" in label + or "نعم" in label + ): label_fixed = "yes" else: label_fixed = None diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_JAIS13b_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..7addca53 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_JAIS13b_ZeroShot.py @@ -0,0 +1,54 @@ +from llmebench.datasets import COVID19FactualityDataset +from llmebench.models import FastChatModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": COVID19FactualityDataset, + "task": FactualityTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["yes", "no"], + "max_tries": 30, + }, + } + + +def prompt(input_sample): + base_prompt = ( + f'Annotate the "tweet" into one of the following categories: correct or incorrect\n\n' + f"tweet: {input_sample}\n" + f"label: \n" + ) + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + + if label.startswith("I am unable to verify".lower()) or label.startswith( + "I am unable to categorize".lower() + ): + label_fixed = None + elif "label: incorrect" in label or "incorrect" in label: + label_fixed = "no" + elif "label: correct" in label or "correct" in label: + label_fixed = "yes" + else: + label_fixed = None + + return label_fixed diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_JAIS13b_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..3cba8362 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_JAIS13b_ZeroShot.py @@ -0,0 +1,59 @@ +from llmebench.datasets import UnifiedFCFactualityDataset +from llmebench.models import FastChatModel +from llmebench.tasks import FactualityTask + + +def metadata(): + return { + "author": "Arabic 
Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": UnifiedFCFactualityDataset, + "task": FactualityTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["true", "false"], + "max_tries": 30, + }, + } + + +def prompt(input_sample): + base_prompt = ( + f'Annotate the "text" into one of the following categories: correct or incorrect\n\n' + f"text: {input_sample}\n" + f"label: \n" + ) + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"].lower() + # label_fixed = label.replace("label:", "").strip() + + if ( + label.startswith("I am unable to verify".lower()) + or label.startswith("I am unable to categorize".lower()) + or label.startswith( + "I am an AI language model and I am unable to verify".lower() + ) + ): + label_fixed = None + elif "label: incorrect" in label or "incorrect" in label: + label_fixed = "false" + elif "label: correct" in label or "correct" in label: + label_fixed = "true" + else: + label_fixed = None + + return label_fixed diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..18de6cb1 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_JAIS13b_ZeroShot.py @@ -0,0 +1,67 @@ +from llmebench.datasets import CT22HarmfulDataset +from llmebench.models import FastChatModel +from llmebench.tasks import HarmfulDetectionTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": CT22HarmfulDataset, + "task": HarmfulDetectionTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + base_prompt = ( + f'Given the following tweet, analyze its content and identify if it could potentially cause harm to an individual, society, or an organization. 
Annotate the tweet as "harmful" if it has the potential to cause harm, or "not_harmful" if it does not.\n\n' + f"tweet: {input_sample}\n" + f"label: \n" + ) + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"].lower() + + if "label: harmful" in label: + label_fixed = "1" + elif "label: not_harmful" in label: + label_fixed = "0" + elif "label: " in label: + arr = label.split("label: ") + label = arr[1].strip() + + elif label.startswith("harmful") or label.startswith("yes"): + label_fixed = "1" + + elif ( + label.startswith("no") + or label == "label: safe" + or label == "not_harmful" + or "not harmful" in label + ): + label_fixed = "0" + else: + label = label.replace(".", "").strip().lower() + label = label.replace("label:", "").strip() + label_fixed = label + + return label_fixed diff --git a/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_JAIS13b_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..651ea9f6 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_JAIS13b_ZeroShot.py @@ -0,0 +1,48 @@ +from llmebench.datasets import OSACT4SubtaskBDataset +from llmebench.models import FastChatModel +from llmebench.tasks import HateSpeechTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": OSACT4SubtaskBDataset, + "task": HateSpeechTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["HS", "NOT_HS"], + "max_tries": 3, + }, + } + + +def prompt(input_sample): + base_prompt = f'Given the following tweet, label it as "HS" or "NOT_HS" based on the content of the tweet: \n {input_sample}' + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + out = response["choices"][0]["message"]["content"] + j = out.find(".") + if j > 0: + out = out[0:j] + + if "not_hate_speech" in out or "no_hate_speech" in out or "NOT_HS" == out: + out = "NOT_HS" + elif "hate_speech" in out or "HS" == out: + out = "HS" + else: + out = None + return out diff --git a/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_JAIS13b_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..b845742a --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_JAIS13b_ZeroShot.py @@ -0,0 +1,41 @@ +from llmebench.datasets import OSACT4SubtaskADataset +from llmebench.models import FastChatModel +from llmebench.tasks import OffensiveTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": OSACT4SubtaskADataset, + "task": OffensiveTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["OFF", "NOT_OFF"], + "max_tries": 3, + }, + } + + +def prompt(input_sample): + base_prompt = f'if the following Arabic sentence is offensive, just say "OFF", otherwise, say just "NOT_OFF" without explanation: \n {input_sample}' + return [ + { + "role": "user", + "content": 
base_prompt, + }, + ] + + +def post_process(response): + out = response["choices"][0]["message"]["content"] + j = out.find(".") + if j > 0: + out = out[0:j] + return out diff --git a/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_JAIS13b_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..42b81aeb --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_JAIS13b_ZeroShot.py @@ -0,0 +1,162 @@ +import re + +from llmebench.datasets import WANLP22T3PropagandaDataset +from llmebench.models import FastChatModel +from llmebench.tasks import MultilabelPropagandaTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": WANLP22T3PropagandaDataset, + "dataset_args": {"techniques_path": "classes.txt"}, + "task": MultilabelPropagandaTask, + "model": FastChatModel, + "model_args": { + "class_labels": [ + "no technique", + "Smears", + "Exaggeration/Minimisation", + "Loaded Language", + "Appeal to fear/prejudice", + "Name calling/Labeling", + "Slogans", + "Repetition", + "Doubt", + "Obfuscation, Intentional vagueness, Confusion", + "Flag-waving", + "Glittering generalities (Virtue)", + "Misrepresentation of Someone's Position (Straw Man)", + "Presenting Irrelevant Data (Red Herring)", + "Appeal to authority", + "Whataboutism", + "Black-and-white Fallacy/Dictatorship", + "Thought-terminating cliché", + "Causal Oversimplification", + ], + "max_tries": 30, + }, + } + + +def prompt(input_sample): + base_prompt = ( + f'Label this "tweet" based on the following propaganda techniques:\n\n' + f"'no technique' , 'Smears' , 'Exaggeration/Minimisation' , 'Loaded Language' , 'Appeal to fear/prejudice' , 'Name calling/Labeling' , 'Slogans' , 'Repetition' , 'Doubt' , 'Obfuscation, Intentional vagueness, Confusion' , 'Flag-waving' , 'Glittering generalities (Virtue)' , 'Misrepresentation of Someone's Position (Straw Man)' , 'Presenting Irrelevant Data (Red Herring)' , 'Appeal to authority' , 'Whataboutism' , 'Black-and-white Fallacy/Dictatorship' , 'Thought-terminating cliché' , 'Causal Oversimplification'" + f"\nAnswer (only yes/no) in the following format: \n" + f"'Doubt': 'yes', " + f"'Smears': 'no', \n\n" + f"tweet: {input_sample}\n\n" + f"label: \n" + ) + + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def fix_label(pred_label): + if "used in this text" in pred_label: + return ["no technique"] + + labels_fixed = [] + pred_label = pred_label.replace('"', "'").split("', '") + pred_labels = [] + + for l in pred_label: + splits = l.replace(",", "").split(":") + if len(splits) > 1 and "no" in splits[1]: + continue + pred_labels.append(splits[0].replace("'", "")) + + if len(pred_labels) == 0: + return ["no technique"] + + for label in pred_labels: + label = label.replace(".", "").strip() + label = re.sub("-", " ", label) + label = label.strip().lower() + + # Handle case of single word labels like "Smears" so we just capitalize it + label_fixed = label.capitalize() + + # print(label) + if "slogan" in label: + label_fixed = "Slogans" + if "loaded" in label: + label_fixed = "Loaded Language" + if "prejudice" in label or "fear" in label or "mongering" in label: + label_fixed = "Appeal to fear/prejudice" + if "terminating" in label or "thought" in label: + label_fixed = 
"Thought-terminating cliché" + if "calling" in label or label == "name c": + label_fixed = "Name calling/Labeling" + if "minimisation" in label or label == "exaggeration minim": + label_fixed = "Exaggeration/Minimisation" + if "glittering" in label: + label_fixed = "Glittering generalities (Virtue)" + if "flag" in label: + label_fixed = "Flag-waving" + if "obfuscation" in label: + label_fixed = "Obfuscation, Intentional vagueness, Confusion" + if "oversimplification" in label or "causal" in label: + label_fixed = "Causal Oversimplification" + if "authority" in label: + label_fixed = "Appeal to authority" + if "dictatorship" in label or "black" in label or "white" in label: + label_fixed = "Black-and-white Fallacy/Dictatorship" + if "herring" in label or "irrelevant" in label: + label_fixed = "Presenting Irrelevant Data (Red Herring)" + if "straw" in label or "misrepresentation" in label: + label_fixed = "Misrepresentation of Someone's Position (Straw Man)" + if "whataboutism" in label: + label_fixed = "Whataboutism" + + if ( + "no propaganda" in label + or "technique" in label + or label == "" + or label == "no" + or label == "appeal to history" + or label == "appeal to emotion" + or label == "appeal to" + or label == "appeal" + or label == "appeal to author" + or label == "emotional appeal" + or "no techn" in label + or "hashtag" in label + or "theory" in label + or "specific mention" in label + or "religious" in label + or "gratitude" in label + ): + label_fixed = "no technique" + + labels_fixed.append(label_fixed) + + out_put_labels = [] + # Remove no technique label when we have other techniques for the same text + if len(labels_fixed) > 1: + for flabel in labels_fixed: + if flabel != "no technique": + out_put_labels.append(flabel) + return out_put_labels + + return labels_fixed + + +def post_process(response): + label = response["choices"][0]["message"]["content"].lower() + pred_label = fix_label(label) + + return pred_label diff --git a/assets/ar/factuality_disinformation_harmful_content/spam/Spam_JAIS13b_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/spam/Spam_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..e0342419 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/spam/Spam_JAIS13b_ZeroShot.py @@ -0,0 +1,45 @@ +from llmebench.datasets import SpamDataset +from llmebench.models import FastChatModel +from llmebench.tasks import SpamTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": SpamDataset, + "task": SpamTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["__label__ADS", "__label__NOTADS"], + "max_tries": 3, + }, + } + + +def prompt(input_sample): + base_prompt = f"If the following sentence can be classified as spam or contains an advertisemnt, write '__label__ADS' without explnanation, otherwise write '__label__NOTADS' without explanantion.\n {input_sample}\n" + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + out = response["choices"][0]["message"]["content"] + j = out.find(".") + if j > 0: + out = out[0:j] + if "NOTADS" in out: + out = "__label__NOTADS" + elif "ADS" in out: + out = "__label__ADS" + return out diff --git a/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_JAIS13b_ZeroShot.py 
b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..9360df94 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_JAIS13b_ZeroShot.py @@ -0,0 +1,49 @@ +from llmebench.datasets import CT23SubjectivityDataset +from llmebench.models import FastChatModel +from llmebench.tasks import SubjectivityTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": CT23SubjectivityDataset, + "task": SubjectivityTask, + "model": FastChatModel, + "general_args": {"test_split": "ar/dev"}, + } + + +def prompt(input_sample): + base_prompt = ( + f"صنف الجملة إلى لاموضوعية أو موضوعية.\n\n" + f"التغريدة: {input_sample}\n" + f"التصنيف: \n" + ) + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"].lower().replace(".", "") + + if "لاموضوعية" in label: + label_fixed = "SUBJ" + elif ( + label == "موضوعية" or label == "التصنيف: موضوعية" or "التصنيف: موضوعية" in label + ): + label_fixed = "OBJ" + else: + label_fixed = None + + return label_fixed diff --git a/assets/ar/news_categorization/ASND_JAIS13b_ZeroShot.py b/assets/ar/news_categorization/ASND_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..c37e404d --- /dev/null +++ b/assets/ar/news_categorization/ASND_JAIS13b_ZeroShot.py @@ -0,0 +1,70 @@ +from llmebench.datasets import ASNDDataset +from llmebench.models import FastChatModel +from llmebench.tasks import NewsCategorizationTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": ASNDDataset, + "task": NewsCategorizationTask, + "model": FastChatModel, + } + + +def prompt(input_sample): + base_prompt = ( + f"صنف التغريدة التالية إلى واحدة من الفئات التالية: " + f"جريمة-حرب-صراع ، روحي-ديني ، صحة ، سياسة ، حقوق-الإنسان-حرية-الصحافة ، " + f"تعليم ، أعمال-اقتصاد ، فن-ترفيه ، أخرى ، " + f"علوم-تكنولوجيا ، رياضة ، بيئة\n" + f"\nالتغريدة: {input_sample}" + f"\nالفئة: \n" + ) + + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + + if "جريمة-حرب-صراع" in label or "صراع-حرب" in label: + label_fixed = "crime-war-conflict" + elif "روحي" in label or "ديني" in label: + label_fixed = "spiritual" + elif "صحة" in label: + label_fixed = "health" + elif "سياسة" in label: + label_fixed = "politics" + elif "حقوق-الإنسان-حرية-الصحافة" in label: + label_fixed = "human-rights-press-freedom" + elif "تعليم" in label: + label_fixed = "education" + elif "أعمال-و-اقتصاد" in label or "أعمال" in label or "اقتصاد" in label: + label_fixed = "business-and-economy" + elif "فن-و-ترفيه" in label or "ترفيه" in label: + label_fixed = "art-and-entertainment" + elif "أخرى" in label: + label_fixed = "others" + elif "علم-و-تكنولوجيا" in label or "علوم" in label or "تكنولوجيا" in label: + label_fixed = "science-and-technology" + elif "رياضة" in label: + label_fixed = "sports" + elif "بيئة" in label: + label_fixed = "environment" + else: + label_fixed = "others" + + return label_fixed diff --git 
a/assets/ar/news_categorization/SANADAkhbarona_JAIS13b_ZeroShot.py b/assets/ar/news_categorization/SANADAkhbarona_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..383d9a19
--- /dev/null
+++ b/assets/ar/news_categorization/SANADAkhbarona_JAIS13b_ZeroShot.py
@@ -0,0 +1,72 @@
+import random
+
+from llmebench.datasets import SANADAkhbaronaDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import NewsCategorizationTask
+
+
+random.seed(1333)
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+    }
+
+
+def config():
+    return {
+        "dataset": SANADAkhbaronaDataset,
+        "task": NewsCategorizationTask,
+        "model": FastChatModel,
+        "model_args": {
+            "class_labels": [
+                "politics",
+                "religion",
+                "medical",
+                "sports",
+                "tech",
+                "finance",
+                "culture",
+            ],
+            "max_tries": 30,
+        },
+    }
+
+
+def prompt(input_sample):
+    base_prompt = (
+        f'Categorize the news "article" into one of the following categories: politics, religion, medical, sports, tech, finance, culture\n\n'
+        f"article: {input_sample}\n"
+        f"category: \n"
+    )
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"]
+
+    label_fixed = label.lower()
+    label_fixed = label_fixed.replace("category: ", "")
+    label_fixed = label_fixed.replace("science/physics", "tech")
+    label_fixed = label_fixed.replace("health/nutrition", "medical")
+    if "سياسة" in label or "السياسة" in label:
+        label_fixed = "politics"
+    # Split on whitespace; the original split("\s+") matched the literal
+    # string "\s+" and therefore never split anything
+    if len(label_fixed.split()) > 1:
+        label_fixed = label_fixed.split()[0]
+    label_fixed = random.choice(label_fixed.split("/")).strip()
+    if "science/physics" in label_fixed:
+        label_fixed = label_fixed.replace("science/physics", "tech")
+    if label_fixed.startswith("culture"):
+        label_fixed = label_fixed.split("(")[0]
+
+    label_fixed = label_fixed.replace("culture.", "culture")
+
+    return label_fixed
diff --git a/assets/ar/news_categorization/SANADAlArabiya_JAIS13b_ZeroShot.py b/assets/ar/news_categorization/SANADAlArabiya_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..68f18960
--- /dev/null
+++ b/assets/ar/news_categorization/SANADAlArabiya_JAIS13b_ZeroShot.py
@@ -0,0 +1,72 @@
+import random
+
+from llmebench.datasets import SANADAlArabiyaDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import NewsCategorizationTask
+
+
+random.seed(1333)
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+    }
+
+
+def config():
+    return {
+        "dataset": SANADAlArabiyaDataset,
+        "task": NewsCategorizationTask,
+        "model": FastChatModel,
+        "model_args": {
+            "class_labels": [
+                "politics",
+                "religion",
+                "medical",
+                "sports",
+                "tech",
+                "finance",
+                "culture",
+            ],
+            "max_tries": 30,
+        },
+    }
+
+
+def prompt(input_sample):
+    base_prompt = (
+        f'Categorize the news "article" into one of the following categories: politics, religion, medical, sports, tech, finance, culture\n\n'
+        f"article: {input_sample}\n"
+        f"category: \n"
+    )
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"]
+
+    label_fixed = label.lower()
+    label_fixed = label_fixed.replace("category: ", "")
+    label_fixed = label_fixed.replace("science/physics", "tech")
+    label_fixed = label_fixed.replace("health/nutrition", "medical")
+    if "سياسة" in label or "السياسة" in label:
+        label_fixed = "politics"
+    # Split on whitespace; the original split("\s+") matched the literal
+    # string "\s+" and therefore never split anything
+    if len(label_fixed.split()) > 1:
+        label_fixed = label_fixed.split()[0]
+    label_fixed = random.choice(label_fixed.split("/")).strip()
+    if "science/physics" in label_fixed:
+        label_fixed = label_fixed.replace("science/physics", "tech")
+    if label_fixed.startswith("culture"):
+        label_fixed = label_fixed.split("(")[0]
+
+    label_fixed = label_fixed.replace("culture.", "culture")
+
+    return label_fixed
diff --git a/assets/ar/news_categorization/SANADAlKhaleej_JAIS13b_ZeroShot.py b/assets/ar/news_categorization/SANADAlKhaleej_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..1e167ffa
--- /dev/null
+++ b/assets/ar/news_categorization/SANADAlKhaleej_JAIS13b_ZeroShot.py
@@ -0,0 +1,77 @@
+import random
+
+from llmebench.datasets import SANADAlKhaleejDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import NewsCategorizationTask
+
+
+random.seed(1333)
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+    }
+
+
+def config():
+    return {
+        "dataset": SANADAlKhaleejDataset,
+        "task": NewsCategorizationTask,
+        "model": FastChatModel,
+        "model_args": {
+            "class_labels": [
+                "culture",
+                "finance",
+                "medical",
+                "politics",
+                "religion",
+                "sports",
+                "tech",
+            ],
+            "max_tries": 30,
+        },
+    }
+
+
+def prompt(input_sample):
+    base_prompt = (
+        f'Categorize the news "article" into one of the following categories: culture, finance, medical, politics, religion, sports, tech\n\n'
+        f"article: {input_sample}\n"
+        f"category: \n"
+    )
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"]
+    label_list = config()["model_args"]["class_labels"]
+    label_fixed = label.lower()
+    label_fixed = label_fixed.replace("category: ", "")
+    label_fixed = label_fixed.replace("science/physics", "tech")
+    label_fixed = label_fixed.replace("health/nutrition", "medical")
+
+    if "سياسة" in label or "السياسة" in label:
+        label_fixed = "politics"
+
+    if label_fixed.strip() in label_list:
+        label_fixed = label_fixed.strip()
+
+    elif "science/physics" in label_fixed:
+        label_fixed = label_fixed.replace("science/physics", "tech")
+    elif label_fixed.startswith("culture"):
+        label_fixed = label_fixed.split("(")[0]
+        label_fixed = label_fixed.replace("culture.", "culture")
+    elif "/" in label:
+        label_fixed = random.choice(label_fixed.split("/")).strip()
+    else:
+        label_fixed = None
+
+    return label_fixed
diff --git a/assets/ar/semantics/NLI/XNLI_JAIS13b_ZeroShot.py b/assets/ar/semantics/NLI/XNLI_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..022bff8d
--- /dev/null
+++ b/assets/ar/semantics/NLI/XNLI_JAIS13b_ZeroShot.py
@@ -0,0 +1,57 @@
+from llmebench.datasets import XNLIDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import XNLITask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+    }
+
+
+def config():
+    return {
+        "dataset": XNLIDataset,
+        "task": XNLITask,
+        "model": FastChatModel,
+    }
+
+
+def prompt(input_sample):
+    sent1, sent2 = input_sample.split("\t")
+    prompt_text = "نقدم لك جملتين تمثلان فرضيتين. 
مهمتك هي تصنيف الفرضية اللاحقة بالنسبة للفرضية المسبقة تبعاً لواحدة من هذه التصنيفات: صحيح (الفرضية اللاحقة تدل على نفس الفرضية المسبقة)، خطأ (الفرضية اللاحقة تناقض الفرضية المسبقة)، أو غير معروف (حيادي). يجب أن يقتصر ردك على واحدة من هذه التصنيفات: صحيح، خطأ، أو غير معروف." + base_prompt = ( + prompt_text + + "\nالفرضية المسبقة: " + + sent1 + + "\nالفرضية اللاحقة: " + + sent2 + + "\n" + + "التصنيف: " + ) + + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + input_label = response["choices"][0]["message"]["content"] + input_label = input_label.replace(".", "").strip().lower() + + if "غير معروف" in input_label or "حيادي" in input_label: + pred_label = "neutral" + elif "صحيح" in input_label or "تدل" in input_label: + pred_label = "entailment" + elif "خطأ" in input_label or "تناقض" in input_label: + pred_label = "contradiction" + else: + print(input_label) + pred_label = None + + return pred_label diff --git a/assets/ar/semantics/STS/Q2QSim_JAIS13b_ZeroShot.py b/assets/ar/semantics/STS/Q2QSim_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..0a06b019 --- /dev/null +++ b/assets/ar/semantics/STS/Q2QSim_JAIS13b_ZeroShot.py @@ -0,0 +1,57 @@ +from llmebench.datasets import STSQ2QDataset +from llmebench.models import FastChatModel +from llmebench.tasks import Q2QSimDetectionTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": STSQ2QDataset, + "task": Q2QSimDetectionTask, + "model": FastChatModel, + "model_args": { + "max_tries": 3, + }, + } + + +def prompt(input_sample): + q1, q2 = input_sample.split("\t") + input_sample = q1 + "\t" + q2 + base_prompt = f"Are the following two questions semantically similar (i.e., asking for similar information)? 
The output should be exactly in form yes or no.\n\n{input_sample}"
+
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    input_label = response["choices"][0]["message"]["content"]
+    input_label = input_label.replace(".", "").strip().lower()
+    pred_label = ""
+
+    if "yes" in input_label or "label: 1" in input_label:
+        pred_label = "1"
+    if (
+        input_label == "no"
+        or input_label.startswith("no,")
+        or "label: 0" in input_label
+        or "label: no" in input_label
+        or "not semantically similar" in input_label
+    ):
+        pred_label = "0"
+
+    if pred_label == "":
+        pred_label = None
+
+    return pred_label
diff --git a/assets/ar/semantics/STS/SemEval17T1STS_JAIS13b_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T1STS_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..f3bdcd6d
--- /dev/null
+++ b/assets/ar/semantics/STS/SemEval17T1STS_JAIS13b_ZeroShot.py
@@ -0,0 +1,58 @@
+from llmebench.datasets import SemEval17T1STSDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import STSTask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+    }
+
+
+def config():
+    return {
+        "dataset": SemEval17T1STSDataset,
+        "task": STSTask,
+        "model": FastChatModel,
+        "model_args": {
+            "max_tries": 3,
+        },
+    }
+
+
+def prompt(input_sample):
+    base_prompt = (
+        f"Given two sentences, produce a continuous valued similarity score on a "
+        f"scale from 0 to 5, with 0 indicating that the semantics of the sentences are "
+        f"completely independent and 5 signifying semantic equivalence. The output "
+        f"should be exactly in form Similarity score =. \n{input_sample}"
+    )
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    raw_response = response["choices"][0]["message"]["content"]
+
+    if "Similarity score =" in raw_response:
+        # Split on the same marker checked above (no trailing space), so
+        # outputs like "Similarity score =4.5" are also handled
+        pred_num = (
+            raw_response.split("Similarity score =")[1]
+            .strip()
+            .split(" ")[0]
+            .rstrip(".")
+        )
+        try:
+            score = float(pred_num)
+        except ValueError:
+            score = None
+    else:
+        try:
+            score = float(raw_response)
+        except Exception as e:
+            score = None
+
+    return score
diff --git a/assets/ar/semantics/STS/SemEval17T2STS_JAIS13b_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T2STS_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..9f377eb3
--- /dev/null
+++ b/assets/ar/semantics/STS/SemEval17T2STS_JAIS13b_ZeroShot.py
@@ -0,0 +1,58 @@
+from llmebench.datasets import SemEval17T2STSDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import STSTask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+    }
+
+
+def config():
+    return {
+        "dataset": SemEval17T2STSDataset,
+        "task": STSTask,
+        "model": FastChatModel,
+        "model_args": {
+            "max_tries": 3,
+        },
+    }
+
+
+def prompt(input_sample):
+    base_prompt = (
+        f"Given two sentences, produce a continuous valued similarity score on a "
+        f"scale from 0 to 5, with 0 indicating that the semantics of the sentences are "
+        f"completely independent and 5 signifying semantic equivalence. The output "
+        f"should be exactly in form Similarity score =. \n{input_sample}"
+    )
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    raw_response = response["choices"][0]["message"]["content"]
+
+    if "Similarity score =" in raw_response:
+        # Split on the same marker checked above (no trailing space), so
+        # outputs like "Similarity score =4.5" are also handled
+        pred_num = (
+            raw_response.split("Similarity score =")[1]
+            .strip()
+            .split(" ")[0]
+            .rstrip(".")
+        )
+        try:
+            score = float(pred_num)
+        except ValueError:
+            score = None
+    else:
+        try:
+            score = float(raw_response)
+        except Exception as e:
+            score = None
+
+    return score
diff --git a/assets/ar/sentiment_emotion_others/emotion/Emotion_Jais13b_FewShot.py b/assets/ar/sentiment_emotion_others/emotion/Emotion_Jais13b_FewShot.py
new file mode 100644
index 00000000..461b1636
--- /dev/null
+++ b/assets/ar/sentiment_emotion_others/emotion/Emotion_Jais13b_FewShot.py
@@ -0,0 +1,92 @@
+from llmebench.datasets import EmotionDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import EmotionTask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat. 3-shot results.",
+        "scores": {"Jaccard similarity": "0.1001005"},
+    }
+
+
+def config():
+    return {
+        "dataset": EmotionDataset,
+        "task": EmotionTask,
+        "model": FastChatModel,
+        "model_args": {
+            "class_labels": [
+                "anger",
+                "disgust",
+                "fear",
+                "joy",
+                "love",
+                "optimism",
+                "pessimism",
+                "sadness",
+                "surprise",
+                "trust",
+            ],
+            "max_tries": 30,
+        },
+    }
+
+
+def prompt(input_sample, examples):
+    base_prompt = f"Predict all the possible emotions in the following Arabic sentence without explanation and put them in a Python list. List of emotions is: anger, anticipation, disgust, fear, joy, love, optimism, pessimism, sadness, surprise, and trust.\n "
+
+    return [
+        {
+            "role": "user",
+            "content": few_shot_prompt(input_sample, base_prompt, examples),
+        },
+    ]
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+    out_prompt = base_prompt + "\n"
+    for example in examples:
+        # We found ChatGPT gets confused when 0 and 1 are used in the prompt
+        label_list = ", ".join(map(str, example["label"]))
+        out_prompt = (
+            out_prompt + "Sentence: " + example["input"] + "\n" + label_list + "\n\n"
+        )
+
+    # Append the sentence we want the model to predict for but leave the Label blank
+    out_prompt = out_prompt + "Sentence: " + input_sample + "\nlabel: \n"
+
+    return out_prompt
+
+
+emotions_positions = {
+    "anger": 0,
+    "anticipation": 1,
+    "disgust": 2,
+    "fear": 3,
+    "joy": 4,
+    "love": 5,
+    "optimism": 6,
+    "pessimism": 7,
+    "sadness": 8,
+    "surprise": 9,
+    "trust": 10,
+}
+
+
+def emotions_array(labels):
+    labels_arr = []
+    for x, y in emotions_positions.items():
+        v = 0
+        if x.lower() in labels:
+            v = 1
+        labels_arr.append(v)
+    return labels_arr
+
+
+def post_process(response):
+    out = emotions_array(response["choices"][0]["message"]["content"])
+
+    return out
diff --git a/assets/ar/sentiment_emotion_others/emotion/Emotion_Jais13b_ZeroShot.py b/assets/ar/sentiment_emotion_others/emotion/Emotion_Jais13b_ZeroShot.py
new file mode 100644
index 00000000..cfddddb1
--- /dev/null
+++ b/assets/ar/sentiment_emotion_others/emotion/Emotion_Jais13b_ZeroShot.py
@@ -0,0 +1,80 @@
+from llmebench.datasets import EmotionDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import EmotionTask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+        "scores": 
{"Jaccard similarity": "0.16779"}, + } + + +def config(): + return { + "dataset": EmotionDataset, + "task": EmotionTask, + "model": FastChatModel, + "model_args": { + "class_labels": [ + "anger", + "disgust", + "fear", + "joy", + "love", + "optimism", + "pessimism", + "sadness", + "surprise", + "trust", + ], + "max_tries": 30, + }, + } + + +def prompt(input_sample): + base_prompt = ( + f"Predict all the possible emotions in the following Arabic sentence without explanation and put them in a Python list. List of emotions is: anger, anticipation, disgust, fear, joy, love, optimism, pessimism, sadness, surprise, and trust.\n " + f"Sentence: {input_sample}\n" + f"label: \n" + ) + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +emotions_positions = { + "anger": 0, + "anticipation": 1, + "disgust": 2, + "fear": 3, + "joy": 4, + "love": 5, + "optimism": 6, + "pessimism": 7, + "sadness": 8, + "surprise": 9, + "trust": 10, +} + + +def emotions_array(labels): + labels_arr = [] + for x, y in emotions_positions.items(): + v = 0 + if x.lower() in labels: + v = 1 + labels_arr.append(v) + return labels_arr + + +def post_process(response): + out = emotions_array(response["choices"][0]["message"]["content"]) + + return out diff --git a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_JAIS13b_ZeroShot.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..63f6a77f --- /dev/null +++ b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_JAIS13b_ZeroShot.py @@ -0,0 +1,60 @@ +from llmebench.datasets import ArSarcasm2Dataset +from llmebench.models import FastChatModel +from llmebench.tasks import SarcasmTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": ArSarcasm2Dataset, + "task": SarcasmTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["TRUE", "FALSE"], + "max_tries": 3, + }, + } + + +def prompt(input_sample): + base_prompt = ( + f'Predict whether the following "tweet" is sarcastic. Return "yes" if the tweet is sarcastic ' + f'and "no" if the tweet is not sarcastic. 
Provide only label.\n\ntweet: {input_sample} \n' + f"label: \n" + ) + + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + out = response["choices"][0]["message"]["content"] + out = out.strip().lower() + + if "i apologize" in out: + return None + + j = out.find("label:") + if j > 0: + out = out[j + len("label:") :] + else: + j = out.find(" is:\n\n") + if j > 0: + out = out[j + len(" is:\n\n") :] + out = out.strip().title() + if out.lower() == "yes": + return "TRUE" + elif out.lower() == "no": + return "FALSE" + return None diff --git a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_JAIS13b_ZeroShot.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..1bd4f1f3 --- /dev/null +++ b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_JAIS13b_ZeroShot.py @@ -0,0 +1,60 @@ +from llmebench.datasets import ArSarcasmDataset +from llmebench.models import FastChatModel +from llmebench.tasks import SarcasmTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + } + + +def config(): + return { + "dataset": ArSarcasmDataset, + "task": SarcasmTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["TRUE", "FALSE"], + "max_tries": 3, + }, + } + + +def prompt(input_sample): + base_prompt = ( + f'Predict whether the following "tweet" is sarcastic. Return "yes" if the tweet is sarcastic ' + f'and "no" if the tweet is not sarcastic. Provide only label.\n\ntweet: {input_sample} \n' + f"label: \n" + ) + + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + out = response["choices"][0]["message"]["content"] + out = out.strip().lower() + + if "i apologize" in out: + return None + + j = out.find("label:") + if j > 0: + out = out[j + len("label:") :] + else: + j = out.find(" is:\n\n") + if j > 0: + out = out[j + len(" is:\n\n") :] + out = out.strip().title() + if out.lower() == "yes": + return "TRUE" + elif out.lower() == "no": + return "FALSE" + return None diff --git a/assets/ar/sentiment_emotion_others/sentiment/ArSAS_JAIS13b_ZeroShot.py b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_JAIS13b_ZeroShot.py new file mode 100644 index 00000000..6126cdb9 --- /dev/null +++ b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_JAIS13b_ZeroShot.py @@ -0,0 +1,53 @@ +from llmebench.datasets import ArSASDataset +from llmebench.models import FastChatModel +from llmebench.tasks import SentimentTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + "scores": {"Macro-F1": "0.304"}, + } + + +def config(): + return { + "dataset": ArSASDataset, + "task": SentimentTask, + "model": FastChatModel, + } + + +def prompt(input_sample): + base_prompt = ( + f'Classify the sentiment of the following sentence as "Positive", "Negative", "Neutral" or "Mixed". 
Output only the label and nothing else.\n' + f"Sentence: {input_sample}\n" + f"Label: " + ) + + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + out = response["choices"][0]["message"]["content"] + out = out.strip().lower() + + if "i apologize" in out: + return None + + j = out.find("label:") + if j > 0: + out = out[j + len("label:") :] + else: + j = out.find(" is:\n\n") + if j > 0: + out = out[j + len(" is:\n\n") :] + out = out.strip().title() + return out diff --git a/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_Jais13b_FewShot.py b/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_Jais13b_FewShot.py new file mode 100644 index 00000000..815fef8b --- /dev/null +++ b/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_Jais13b_FewShot.py @@ -0,0 +1,95 @@ +from llmebench.datasets import ANSStanceDataset +from llmebench.models import FastChatModel +from llmebench.tasks import StanceTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b", + "description": "Locally hosted Jais-13b-chat model using FastChat. 5-shot", + "scores": {"Macro-F1": "0.5272154156628485"}, + } + + +def config(): + return { + "dataset": ANSStanceDataset, + "task": StanceTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["agree", "disagree"], + "max_tries": 30, + }, + } + + +def prompt(input_sample, examples): + ref_s = input_sample["sentence_1"] + claim = input_sample["sentence_2"] + base_prompt = ( + f"Given a reference sentence and a claim, predict whether the claim agrees or disagrees with the reference sentence. Reply only using 'agree', 'disagree', or use 'other' if the sentence and claim are unrelated." + f"\n\n" + f"reference sentence: {ref_s}" + f"\nclaim: {claim}" + f"\nlabel: \n" + ) + + return [ + { + "role": "user", + "content": few_shot_prompt(input_sample, base_prompt, examples), + }, + ] + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + for example in examples: + ref_s = example["input"]["sentence_1"] + claim = example["input"]["sentence_2"] + + out_prompt = ( + out_prompt + + "reference sentence: " + + ref_s + + "\nclaim: " + + claim + + "\nlabel: " + + example["label"] + + "\n\n" + ) + + # Append the sentence we want the model to predict for but leave the label blank + + ref_s = input_sample["sentence_1"] + claim = input_sample["sentence_2"] + + out_prompt = ( + out_prompt + + "reference sentence: " + + ref_s + + "\nclaim: " + + claim + + "\nlabel: \n" + ) + + return out_prompt + + +def post_process(response): + out = response["choices"][0]["message"]["content"] + out = out.strip().lower() + + if "i apologize" in out: + return None + + j = out.find("label:") + if j > 0: + out = out[j + len("label:") :] + else: + j = out.find(" is:\n\n") + if j > 0: + out = out[j + len(" is:\n\n") :] + out = out.strip().title() + return out.lower() diff --git a/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_Jais13b_ZeroShot.py b/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_Jais13b_ZeroShot.py new file mode 100644 index 00000000..89e3c00c --- /dev/null +++ b/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_Jais13b_ZeroShot.py @@ -0,0 +1,61 @@ +from llmebench.datasets import ANSStanceDataset +from llmebench.models import FastChatModel +from llmebench.tasks import StanceTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": 
"Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + "scores": {"Macro-F1": "0.465626102292769"}, + } + + +def config(): + return { + "dataset": ANSStanceDataset, + "task": StanceTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["agree", "disagree"], + "max_tries": 30, + }, + } + + +def prompt(input_sample): + ref_s = input_sample["sentence_1"] + claim = input_sample["sentence_2"] + base_prompt = ( + f"Given a reference sentence and a claim, predict whether the claim agrees or disagrees with the reference sentence. Reply only using 'agree', 'disagree', or use 'other' if the sentence and claim are unrelated." + f"\n\n" + f"reference sentence: {ref_s}" + f"\nclaim: {claim}" + f"\nlabel: \n" + ) + + return [ + { + "role": "user", + "content": base_prompt, + }, + ] + + +def post_process(response): + out = response["choices"][0]["message"]["content"] + out = out.strip().lower() + + if "i apologize" in out: + return None + + j = out.find("label:") + if j > 0: + out = out[j + len("label:") :] + else: + j = out.find(" is:\n\n") + if j > 0: + out = out[j + len(" is:\n\n") :] + out = out.strip().title() + return out.lower() diff --git a/docs/tutorials/adding_model.md b/docs/tutorials/adding_model.md index e4231859..58a2c218 100644 --- a/docs/tutorials/adding_model.md +++ b/docs/tutorials/adding_model.md @@ -1,7 +1,7 @@ # Adding Model Provider -Implementing a model to use for benchmarking can be done by defining a model provider that is hosting that model. . Check first if the model provider you are trying to access a model thorugh has an implementation in `llmebench/models`. If not, implement a new model provider module (e.g. `llmebench/models/FastChat.py`), which implements a class (e.g. `FastChatModel`) which subclasses `ModelBase`. See an existing model provider module for inspiration. Each new model class requires implementing three functions: +Implementing a model to use for benchmarking can be done by defining a model provider that is hosting that model. . Check first if the model provider you are trying to access a model through has an implementation in `llmebench/models`. If not, implement a new model provider module (e.g. `llmebench/models/FastChat.py`), which implements a class (e.g. `FastChatModel`) which subclasses `ModelBase`. See an existing model provider module for inspiration. Each new model class requires implementing three functions: ```python class NewModel(ModelBase): @@ -20,6 +20,6 @@ class NewModel(ModelBase): # part of the model response that contains the answer to the prompt ``` -**Note:** Further details on paramters and role for each function can be found [here](https://github.com/qcri/LLMeBench/blob/main/llmebench/models/model_base.py). +**Note:** Further details on parameters and role for each function can be found [here](https://github.com/qcri/LLMeBench/blob/main/llmebench/models/model_base.py). Once the `Model` is implemented, export it in `llmebench/models/__init__.py`. 
diff --git a/llmebench/datasets/ANSFactuality.py b/llmebench/datasets/ANSFactuality.py
index 72724643..e69481d9 100644
--- a/llmebench/datasets/ANSFactuality.py
+++ b/llmebench/datasets/ANSFactuality.py
@@ -26,8 +26,8 @@ def metadata():
             "link": "https://github.com/latynt/ans",
             "download_url": "https://github.com/latynt/ans/archive/refs/heads/master.zip",
             "splits": {
-                "test": "claim/test.csv",
-                "train": "claim/train.csv",
+                "test": "ans-master/data/claim/test.csv",
+                "train": "ans-master/data/claim/train.csv",
             },
             "task_type": TaskType.Classification,
             "class_labels": ["true", "false"],
diff --git a/llmebench/datasets/ANSStance.py b/llmebench/datasets/ANSStance.py
index 4fc88125..56be9979 100644
--- a/llmebench/datasets/ANSStance.py
+++ b/llmebench/datasets/ANSStance.py
@@ -26,8 +26,8 @@ def metadata():
             "link": "https://github.com/latynt/ans",
             "download_url": "https://github.com/latynt/ans/archive/refs/heads/master.zip",
             "splits": {
-                "test": "stance/test.csv",
-                "train": "stance/train.csv",
+                "test": "ans-master/data/stance/test.csv",
+                "train": "ans-master/data/stance/train.csv",
            },
             "task_type": TaskType.Classification,
             "class_labels": ["agree", "disagree"],
diff --git a/tests/datasets/test_download_and_caching.py b/tests/datasets/test_download_and_caching.py
index a2b8287f..db5f8904 100644
--- a/tests/datasets/test_download_and_caching.py
+++ b/tests/datasets/test_download_and_caching.py
@@ -37,7 +37,7 @@ def load_data(self, data_path):
 class TestDatasetAutoDownload(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.httpd = SignalingHTTPServer(("", 0), ArchiveHandler)
+        cls.httpd = SignalingHTTPServer(("127.0.0.1", 0), ArchiveHandler)
         cls.port = cls.httpd.server_address[1]
 
         cls.test_server = threading.Thread(target=cls.httpd.serve_forever, daemon=True)
@@ -76,7 +76,7 @@ def test_auto_download_zip(self):
         self.assertTrue(
             dataset.download_dataset(
                 data_dir=data_dir.name,
-                download_url=f"http://localhost:{self.port}/Mock.zip",
+                download_url=f"http://127.0.0.1:{self.port}/Mock.zip",
             )
         )
 
@@ -92,7 +92,7 @@ def test_auto_download_tar(self):
         self.assertTrue(
             dataset.download_dataset(
                 data_dir=data_dir.name,
-                download_url=f"http://localhost:{self.port}/Mock.tar",
+                download_url=f"http://127.0.0.1:{self.port}/Mock.tar",
             )
         )
 
@@ -108,7 +108,7 @@ def test_auto_download_tar_gz(self):
         self.assertTrue(
             dataset.download_dataset(
                 data_dir=data_dir.name,
-                download_url=f"http://localhost:{self.port}/Mock.tar.gz",
+                download_url=f"http://127.0.0.1:{self.port}/Mock.tar.gz",
             )
         )
 
@@ -124,7 +124,7 @@ def test_auto_download_tar_bz2(self):
         self.assertTrue(
             dataset.download_dataset(
                 data_dir=data_dir.name,
-                download_url=f"http://localhost:{self.port}/Mock.tar.bz2",
+                download_url=f"http://127.0.0.1:{self.port}/Mock.tar.bz2",
             )
         )
 
@@ -140,7 +140,7 @@ def test_auto_download_tar_xz(self):
         self.assertTrue(
             dataset.download_dataset(
                 data_dir=data_dir.name,
-                download_url=f"http://localhost:{self.port}/Mock.tar.xz",
+                download_url=f"http://127.0.0.1:{self.port}/Mock.tar.xz",
             )
         )
 
@@ -155,7 +155,7 @@ def test_auto_download_default_url(self):
         dataset = MockDataset(data_dir=data_dir_path)
         self.assertTrue(
             dataset.download_dataset(
-                data_dir=data_dir.name, default_url=f"http://localhost:{self.port}/"
+                data_dir=data_dir.name, default_url=f"http://127.0.0.1:{self.port}/"
             )
         )
 
@@ -175,7 +175,7 @@ def test_auto_download_metadata_url(self):
 
         class MockDatasetWithDownloadURL(MockDataset):
             def metadata():
-                return {"download_url": f"http://localhost:{self.port}/Mock.zip"}
+                return {"download_url": f"http://127.0.0.1:{self.port}/Mock.zip"}
 
         dataset = MockDatasetWithDownloadURL(data_dir=data_dir_path)
         self.assertTrue(dataset.download_dataset(data_dir=data_dir.name))
@@ -197,7 +197,7 @@ def test_auto_download_non_existent(self):
         class MockDatasetWithDownloadURL(MockDataset):
             def metadata():
                 return {
-                    "download_url": f"http://localhost:{self.port}/InvalidDataset.zip"
+                    "download_url": f"http://127.0.0.1:{self.port}/InvalidDataset.zip"
                 }
 
         dataset = MockDatasetWithDownloadURL(data_dir=data_dir_path)