diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Arabic.py
new file mode 100755
index 00000000..4c1272eb
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Arabic.py
@@ -0,0 +1,92 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'. 3 samples were chosen per test sample based on MaxMarginalRelevance for few-shot learning.",
+        "scores": {"F1 (POS)": "0.554"},
+    }
+
+
+def config():
+    return {
+        "dataset": CT22CheckworthinessDataset,
+        "task": CheckworthinessTask,
+        "model": OpenAIModel,
+        "model_args": {
+            "class_labels": ["0", "1"],
+            "max_tries": 30,
+        },
+        "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+    }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+    out_prompt = base_prompt + "\n"
+    out_prompt = out_prompt + "اليك بعض الامثلة:\n\n"
+    for index, example in enumerate(examples):
+        label = "لا" if example["label"] == "0" else "نعم"
+
+        out_prompt = (
+            out_prompt
+            + "مثال "
+            + str(index)
+            + ":"
+            + "\n"
+            + "التغريدة: "
+            + example["input"]
+            + "\nالتصنيف: "
+            + label
+            + "\n\n"
+        )
+
+    # Append the sentence we want the model to predict for but leave the Label blank
+    out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+    return out_prompt
+
+
+def prompt(input_sample, examples):
+    base_prompt = f'هل تحتوي هذه "التغريدة" على ادعاء يستحق التحقق منه؟ أجب بـ نعم أو لا. قدم التصنيف فقط دون شرح.'
+    return [
+        {
+            "role": "system",
+            "content": "أنت خبير في تحليل وتصنيف التغريدات.",
+        },
+        {
+            "role": "user",
+            "content": few_shot_prompt(input_sample, base_prompt, examples),
+        },
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"]
+
+    label = label.replace("label:", "").strip().lower()
+
+    if (
+        "لا_يستحق_التحقق" in label
+        or "لا يستحق التحقق" in label
+        or "ليس يستحق التحقق" in label
+        or "لا تستحق التحقق" in label
+        or "no" in label
+        or "لا" in label
+        or "not" in label
+    ):
+        return "0"
+    elif (
+        "yes" in label
+        or "نعم" in label
+        or "يستحق التحقق" in label
+        or "checkworthy" in label
+    ):
+        return "1"
+    else:
+        return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_English.py
new file mode 100755
index 00000000..725b9ead
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_English.py
@@ -0,0 +1,92 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'. 
3 samples where chosen per test sample based on MaxMarginalRelevance for few shot learning.", + "scores": {"F1 (POS)": "0.554"}, + } + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "task": CheckworthinessTask, + "model": OpenAIModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, + } + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + "\n" + out_prompt = out_prompt + "Here are some examples:\n\n" + for index, example in enumerate(examples): + label = "no" if example["label"] == "0" else "yes" + + out_prompt = ( + out_prompt + + "Example " + + str(index) + + ":" + + "\n" + + "tweet: " + + example["input"] + + "\nlabel: " + + label + + "\n\n" + ) + + # Append the sentence we want the model to predict for but leave the Label blank + out_prompt = out_prompt + "tweet: " + input_sample + "\nlabel: \n" + + return out_prompt + + +def prompt(input_sample, examples): + base_prompt = f'Annotate the "tweet" into "one" of the following categories: checkworthy or not_checkworthy. Provide only label.' + return [ + { + "role": "system", + "content": "You can analyze and classify tweets.", + }, + { + "role": "user", + "content": few_shot_prompt(input_sample, base_prompt, examples), + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + + label = label.replace("label:", "").strip().lower() + + if ( + "لا_يستحق_التحقق" in label + or "لا يستحق التحقق" in label + or "ليس يستحق التحقق" in label + or "لا تستحق التحقق" in label + or "no" in label + or "لا" in label + or "not" in label + ): + return "0" + elif ( + "yes" in label + or "نعم" in label + or "يستحق التحقق" in label + or "checkworthy" in label + ): + return "1" + else: + return None diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Mixed.py new file mode 100755 index 00000000..ed5a651c --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Mixed.py @@ -0,0 +1,91 @@ +import re + +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import OpenAIModel +from llmebench.tasks import CheckworthinessTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "gpt-4-32k (version 0314)", + "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'. 
3 samples where chosen per test sample based on MaxMarginalRelevance for few shot learning.", + "scores": {"F1 (POS)": "0.554"}, + } + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "task": CheckworthinessTask, + "model": OpenAIModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, + } + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + "\n" + out_prompt = out_prompt + "اليك بعض الامثلة:\n\n" + for index, example in enumerate(examples): + label = "not_checkworthy" if example["label"] == "0" else "checkworthy" + + out_prompt = ( + out_prompt + + "مثال " + + str(index) + + ":" + + "\n" + + "التغريدة: " + + example["input"] + + "\التصنيف: " + + label + + "\n\n" + ) + + # Append the sentence we want the model to predict for but leave the Label blank + out_prompt = out_prompt + "التغريدة: " + input_sample + "\التصنيف: \n" + + return out_prompt + + +def prompt(input_sample, examples): + base_prompt = 'هل تحتوي هذه "التغريدة" على ادعاء يستحق التحقق منه؟ أجب بـ checkworthy أو not_checkworthy' + return [ + { + "role": "system", + "content": "أنت خبير في تحليل وتصنيف التغريدات.", + }, + { + "role": "user", + "content": few_shot_prompt(input_sample, base_prompt, examples), + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + label = label.replace("label:", "").strip().lower() + + if ( + "لا_يستحق_التحقق" in label + or "لا يستحق التحقق" in label + or "ليس يستحق التحقق" in label + or "لا تستحق التحقق" in label + or "no" in label + or "لا" in label + or "not" in label + ): + return "0" + elif ( + "yes" in label + or "نعم" in label + or "يستحق التحقق" in label + or "checkworthy" in label + ): + return "1" + else: + return None diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Arabic.py new file mode 100755 index 00000000..8b7cc4e4 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Arabic.py @@ -0,0 +1,71 @@ +import re + +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import OpenAIModel +from llmebench.tasks import CheckworthinessTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "gpt-4-32k (version 0314)", + "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'.", + "scores": {"F1 (POS)": "0.560"}, + } + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "task": CheckworthinessTask, + "model": OpenAIModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + prompt_string = ( + f'هل تحتوي هذه "التغريدة" على ادعاء يستحق التحقق منه؟ أجب بـ نعم أو لا. 
قدم التصنيف فقط دون شرح.\n\n'
+        f"التغريدة: {input_sample}\n"
+        f"التصنيف: \n"
+    )
+    return [
+        {
+            "role": "system",
+            "content": "أنت خبير في تحليل وتصنيف التغريدات.",
+        },
+        {
+            "role": "user",
+            "content": prompt_string,
+        },
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"]
+
+    label = label.replace("label:", "").strip().lower()
+
+    if (
+        "لا_يستحق_التحقق" in label
+        or "لا يستحق التحقق" in label
+        or "ليس يستحق التحقق" in label
+        or "لا تستحق التحقق" in label
+        or "no" in label
+        or "لا" in label
+        or "not" in label
+    ):
+        return "0"
+    elif (
+        "yes" in label
+        or "نعم" in label
+        or "يستحق التحقق" in label
+        or "checkworthy" in label
+    ):
+        return "1"
+    else:
+        return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_English.py
new file mode 100755
index 00000000..435da81f
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_English.py
@@ -0,0 +1,71 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'.",
+        "scores": {"F1 (POS)": "0.560"},
+    }
+
+
+def config():
+    return {
+        "dataset": CT22CheckworthinessDataset,
+        "task": CheckworthinessTask,
+        "model": OpenAIModel,
+        "model_args": {
+            "class_labels": ["0", "1"],
+            "max_tries": 30,
+        },
+        "general_args": {"test_split": "ar"},
+    }
+
+
+def prompt(input_sample):
+    prompt_string = (
+        f'Does the following "tweet" contain a factual claim that is worth fact-checking? return checkworthy if it does or not_checkworthy otherwise. 
Return only label.\n\n' + f"tweet: {input_sample}\n" + f"label: \n" + ) + return [ + { + "role": "system", + "content": "You can analyze and classify tweets.", + }, + { + "role": "user", + "content": prompt_string, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + + label = label.replace("label:", "").strip().lower() + + if ( + "لا_يستحق_التحقق" in label + or "لا يستحق التحقق" in label + or "ليس يستحق التحقق" in label + or "لا تستحق التحقق" in label + or "no" in label + or "لا" in label + or "not" in label + ): + return "0" + elif ( + "yes" in label + or "نعم" in label + or "يستحق التحقق" in label + or "checkworthy" in label + ): + return "1" + else: + return None diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Mixed.py new file mode 100755 index 00000000..7117e701 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Mixed.py @@ -0,0 +1,71 @@ +import re + +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import OpenAIModel +from llmebench.tasks import CheckworthinessTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "gpt-4-32k (version 0314)", + "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'.", + "scores": {"F1 (POS)": "0.560"}, + } + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "task": CheckworthinessTask, + "model": OpenAIModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + prompt_string = ( + f'هل تحتوي هذه "التغريدة" على ادعاء يستحق التحقق منه؟ أجب بـ checkworthy أو not_checkworthy\n\n' + f"التغريدة: {input_sample}\n" + f"التصنيف: \n" + ) + return [ + { + "role": "system", + "content": "أنت خبير في تحليل وتصنيف التغريدات.", + }, + { + "role": "user", + "content": prompt_string, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + + label = label.replace("label:", "").strip().lower() + + if ( + "لا_يستحق_التحقق" in label + or "لا يستحق التحقق" in label + or "ليس يستحق التحقق" in label + or "لا تستحق التحقق" in label + or "no" in label + or "لا" in label + or "not" in label + ): + return "0" + elif ( + "yes" in label + or "نعم" in label + or "يستحق التحقق" in label + or "checkworthy" in label + ): + return "1" + else: + return None diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Arabic.py new file mode 100755 index 00000000..a6c16047 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Arabic.py @@ -0,0 +1,79 @@ +import re + +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import FastChatModel +from llmebench.tasks import CheckworthinessTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "LLama3-8b", + "description": "Deployed on Azure.", + "scores": {"F1 (POS)": "0.554"}, + } + + +def config(): + return { + "dataset": 
CT22CheckworthinessDataset, + "task": CheckworthinessTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, + } + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + "\n" + out_prompt = out_prompt + "هذه بعض الأمثلة:\n\n" + for index, example in enumerate(examples): + label = "غير جديرة بالتحقق" if example["label"] == "0" else "جديرة بالتحقق" + + out_prompt = ( + out_prompt + + "مثال " + + str(index) + + ":" + + "\n" + + "التغريدة: " + + example["input"] + + "\التصنيف: " + + label + + "\n\n" + ) + + out_prompt = out_prompt + "التغريدة: " + input_sample + "\التصنيف: \n" + + return out_prompt + + +def prompt(input_sample, examples): + base_prompt = ' صنف التغريدة الى "جديرة بالتحقق" أو "غير جديرة بالتحقق". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n' + return [ + { + "role": "user", + "content": few_shot_prompt(input_sample, base_prompt, examples), + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + + label = label.replace("label:", "").strip() + + label = label.lower() + if ( + "لا" in label + or "غير" in label + or "no" in label + or "ليس" in label + or "ليست" in label + ): + return "0" + return "1" + return label diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_English.py new file mode 100755 index 00000000..3eccba0a --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_English.py @@ -0,0 +1,79 @@ +import re + +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import FastChatModel +from llmebench.tasks import CheckworthinessTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "LLama3-8b", + "description": "Deployed on Azure.", + "scores": {"F1 (POS)": "0.554"}, + } + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "task": CheckworthinessTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 100, + }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, + } + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + "\n" + out_prompt = out_prompt + "These are some examples:\n\n" + for index, example in enumerate(examples): + label = "not checkworthy" if example["label"] == "0" else "checkworthy" + + out_prompt = ( + out_prompt + + "Example " + + str(index) + + ":\n" + + "Tweet: " + + example["input"] + + "\n" + + "Classification: " + + label + + "\n\n" + ) + + out_prompt = out_prompt + "Tweet: " + input_sample + "\nClassification: \n" + + return out_prompt + + +def prompt(input_sample, examples): + base_prompt = 'Classify the tweet as "checkworthy" or "not checkworthy". 
Provide the classification only for the last tweet, without providing any additional justification:\n' + return [ + { + "role": "user", + "content": few_shot_prompt(input_sample, base_prompt, examples), + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + + label = label.replace("label:", "").strip() + + label = label.lower() + if ( + "لا" in label + or "not" in label + or "no" in label + or "ليس" in label + or "ليست" in label + ): + return "0" + return "1" + return label diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Mixed.py new file mode 100755 index 00000000..84e8f85e --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Mixed.py @@ -0,0 +1,79 @@ +import re + +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import FastChatModel +from llmebench.tasks import CheckworthinessTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "LLama3-8b", + "description": "Deployed on Azure.", + "scores": {"F1 (POS)": "0.554"}, + } + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "task": CheckworthinessTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, + } + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + "\n" + out_prompt = out_prompt + "هذه بعض الأمثلة:\n\n" + for index, example in enumerate(examples): + label = "not checkworthy" if example["label"] == "0" else "checkworthy" + + out_prompt = ( + out_prompt + + "مثال " + + str(index) + + ":" + + "\n" + + "التغريدة: " + + example["input"] + + "\nالتصنيف: " + + label + + "\n\n" + ) + + out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n" + + return out_prompt + + +def prompt(input_sample, examples): + base_prompt = ' صنف التغريدة الى "checkworthy" أو "not checkworthy". 
قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n' + return [ + { + "role": "user", + "content": few_shot_prompt(input_sample, base_prompt, examples), + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + + label = label.replace("label:", "").strip() + + label = label.lower() + if ( + "لا" in label + or "غير" in label + or "no" in label + or "ليس" in label + or "ليست" in label + ): + return "0" + return "1" + return label diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Arabic.py new file mode 100755 index 00000000..99cba377 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Arabic.py @@ -0,0 +1,54 @@ +import re + +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import FastChatModel +from llmebench.tasks import CheckworthinessTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "LLama3-8b", + "description": "Deployed on Azure.", + "scores": {"F1 (POS)": "0.560"}, + } + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "task": CheckworthinessTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + base_prompt = ' صنف التغريدة الى "جديرة بالتحقق" أو "غير جديرة بالتحقق". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n' + return [ + { + "role": "user", + "content": base_prompt + input_sample, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + + label = label.replace("label:", "").strip() + + label = label.lower() + if ( + "لا" in label + or "غير" in label + or "no" in label + or "ليس" in label + or "ليست" in label + ): + return "0" + return "1" diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_English.py new file mode 100755 index 00000000..9b45ee04 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_English.py @@ -0,0 +1,63 @@ +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import FastChatModel +from llmebench.tasks import CheckworthinessTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "Jais-13b-chat", + "description": "Locally hosted Jais-13b-chat model using FastChat.", + "scores": { + "F1 (POS)": "0.000", + "Acc": "0.638", + "comment": "jais only predicts negative class", + }, + } + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "task": CheckworthinessTask, + "model": FastChatModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + base_prompt = ( + f'Classify the tweet as "checkworthy" or "not checkworthy". 
Provide the classification only for the last tweet, without providing any additional justification:\n\n'
+        f"tweet: {input_sample}\n"
+        f"label: \n"
+    )
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"]
+
+    label = label.replace("label:", "").strip()
+
+    label = label.lower()
+    if label == "checkworthy":
+        label_fixed = "1"
+    elif label == "not_checkworthy." or label == "not_checkworthy":
+        label_fixed = "0"
+    elif "not_checkworthy" in label or "not checkworthy" in label:
+        label_fixed = "0"
+    elif "checkworthy" in label:
+        label_fixed = "1"
+    else:
+        label_fixed = None
+
+    return label_fixed
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Mixed.py
new file mode 100755
index 00000000..79a64328
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Mixed.py
@@ -0,0 +1,55 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+        "scores": {"F1 (POS)": "0.560"},
+    }
+
+
+def config():
+    return {
+        "dataset": CT22CheckworthinessDataset,
+        "task": CheckworthinessTask,
+        "model": FastChatModel,
+        "model_args": {
+            "class_labels": ["0", "1"],
+            "max_tries": 30,
+        },
+        "general_args": {"test_split": "ar"},
+    }
+
+
+def prompt(input_sample):
+    base_prompt = ' صنف التغريدة الى "checkworthy" أو "not checkworthy". 
قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n' + return [ + { + "role": "user", + "content": base_prompt + input_sample, + }, + ] + + +def post_process(response): + label = response["choices"][0]["message"]["content"] + + label = label.replace("label:", "").strip() + + label = label.lower() + if ( + "لا" in label + or "غير" in label + or "no" in label + or "ليس" in label + or "ليست" in label + ): + return "0" + return "1" + return label diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Arabic.py new file mode 100755 index 00000000..7b07189e --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Arabic.py @@ -0,0 +1,91 @@ +import re + +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import AzureModel +from llmebench.tasks import CheckworthinessTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "LLama3-8b", + "description": "Deployed on Azure.", + "scores": {"F1 (POS)": "0.554"}, + } + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "task": CheckworthinessTask, + "model": AzureModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, + } + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + "\n" + out_prompt = out_prompt + "هذه بعض الأمثلة:\n\n" + for index, example in enumerate(examples): + label = "غير جديرة بالتحقق" if example["label"] == "0" else "جديرة بالتحقق" + + out_prompt = ( + out_prompt + + "مثال " + + str(index) + + ":" + + "\n" + + "التغريدة: " + + example["input"] + + "\التصنيف: " + + label + + "\n\n" + ) + + out_prompt = out_prompt + "التغريدة: " + input_sample + "\التصنيف: \n" + + return out_prompt + + +def prompt(input_sample, examples): + base_prompt = ' صنف التغريدة الى "جديرة بالتحقق" أو "غير جديرة بالتحقق". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n' + return [ + { + "role": "user", + "content": few_shot_prompt(input_sample, base_prompt, examples), + }, + ] + + +import random + + +def post_process(response): + print(response) + if "output" in response: + # if "content" in response["messages"]: + label = response["output"].strip() + label = label.replace("", "") + label = label.replace("", "") + else: + print("Response .. 
" + str(response)) + label = "" + + label = label.lower() + + if "لا أستطيع" in label: + return random.choice(["0", "1"]) + if ( + "لا" in label + or "غير" in label + or "no" in label + or "ليس" in label + or "ليست" in label + ): + return "0" + return "1" + return label diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_English.py new file mode 100755 index 00000000..45651e3e --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_English.py @@ -0,0 +1,90 @@ +import re + +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import AzureModel +from llmebench.tasks import CheckworthinessTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "LLama3-8b", + "description": "Deployed on Azure.", + "scores": {"F1 (POS)": "0.554"}, + } + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "task": CheckworthinessTask, + "model": AzureModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 100, + }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, + } + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + "\n" + out_prompt = out_prompt + "These are some examples:\n\n" + for index, example in enumerate(examples): + label = "not checkworthy" if example["label"] == "0" else "checkworthy" + + out_prompt = ( + out_prompt + + "Example " + + str(index) + + ":\n" + + "Tweet: " + + example["input"] + + "\n" + + "Classification: " + + label + + "\n\n" + ) + + out_prompt = out_prompt + "Tweet: " + input_sample + "\nClassification: \n" + + return out_prompt + + +def prompt(input_sample, examples): + base_prompt = 'Classify the tweet as "checkworthy" or "not checkworthy". Provide the classification only for the last tweet, without providing any additional justification:\n' + return [ + { + "role": "user", + "content": few_shot_prompt(input_sample, base_prompt, examples), + }, + ] + + +import random + + +def post_process(response): + print(response) + if "output" in response: + # if "content" in response["messages"]: + label = response["output"].strip() + label = label.replace("", "") + label = label.replace("", "") + else: + print("Response .. 
" + str(response)) + label = "" + label = label.lower() + + if "لا أستطيع" in label or "I cannot" in label: + return random.choice(["0", "1"]) + if ( + "not" in label + or "غير" in label + or "no" in label + or "ليس" in label + or "ليست" in label + ): + return "0" + return "1" + return label diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Mixed.py new file mode 100755 index 00000000..1aa9810a --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Mixed.py @@ -0,0 +1,90 @@ +import re + +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import AzureModel +from llmebench.tasks import CheckworthinessTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "LLama3-8b", + "description": "Deployed on Azure.", + "scores": {"F1 (POS)": "0.554"}, + } + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "task": CheckworthinessTask, + "model": AzureModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}}, + } + + +def few_shot_prompt(input_sample, base_prompt, examples): + out_prompt = base_prompt + "\n" + out_prompt = out_prompt + "هذه بعض الأمثلة:\n\n" + for index, example in enumerate(examples): + label = "not checkworthy" if example["label"] == "0" else "checkworthy" + + out_prompt = ( + out_prompt + + "مثال " + + str(index) + + ":" + + "\n" + + "التغريدة: " + + example["input"] + + "\nالتصنيف: " + + label + + "\n\n" + ) + + out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n" + + return out_prompt + + +def prompt(input_sample, examples): + base_prompt = ' صنف التغريدة الى "checkworthy" أو "not checkworthy". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n' + return [ + { + "role": "user", + "content": few_shot_prompt(input_sample, base_prompt, examples), + }, + ] + + +import random + + +def post_process(response): + print(response) + if "output" in response: + # if "content" in response["messages"]: + label = response["output"].strip() + label = label.replace("", "") + label = label.replace("", "") + else: + print("Response .. 
" + str(response)) + label = "" + label = label.lower() + if "لا أستطيع" in label or "I cannot" in label: + return random.choice(["0", "1"]) + if ( + "لا" in label + or "غير" in label + or "no" in label + or "ليس" in label + or "ليست" in label + or "not" in label + ): + return "0" + return "1" + return label diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_ZeroShot_Mixed.py new file mode 100755 index 00000000..c2aada9f --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_ZeroShot_Mixed.py @@ -0,0 +1,66 @@ +import re + +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import AzureModel +from llmebench.tasks import CheckworthinessTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "LLama3-8b", + "description": "Deployed on Azure.", + "scores": {"F1 (POS)": "0.560"}, + } + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "task": CheckworthinessTask, + "model": AzureModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + base_prompt = ' صنف التغريدة الى "checkworthy" أو "not checkworthy". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n' + return [ + { + "role": "user", + "content": base_prompt + input_sample, + }, + ] + + +import random + + +def post_process(response): + print(response) + if "output" in response: + # if "content" in response["messages"]: + label = response["output"].strip() + label = label.replace("", "") + label = label.replace("", "") + else: + print("Response .. " + str(response)) + label = "" + label = label.lower() + + if "لا أستطيع" in label or "I cannot" in label: + return random.choice(["0", "1"]) + if ( + "لا" in label + or "غير" in label + or "no" in label + or "ليس" in label + or "ليست" in label + ): + return "0" + return "1" + return label diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_Arabic.py new file mode 100755 index 00000000..5dc4b3c5 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_Arabic.py @@ -0,0 +1,65 @@ +import re + +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import AzureModel +from llmebench.tasks import CheckworthinessTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "LLama3-8b", + "description": "Deployed on Azure.", + "scores": {"F1 (POS)": "0.560"}, + } + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "task": CheckworthinessTask, + "model": AzureModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + base_prompt = ' صنف التغريدة الى "جديرة بالتحقق" أو "غير جديرة بالتحقق". 
قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n' + return [ + { + "role": "user", + "content": base_prompt + input_sample, + }, + ] + + +import random + + +def post_process(response): + print(response) + if "output" in response: + # if "content" in response["messages"]: + label = response["output"].strip() + label = label.replace("", "") + label = label.replace("", "") + else: + print("Response .. " + str(response)) + label = "" + label = label.lower() + if "لا أستطيع" in label: + return random.choice(["0", "1"]) + if ( + "لا" in label + or "غير" in label + or "no" in label + or "ليس" in label + or "ليست" in label + ): + return "0" + return "1" + return label diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_English.py new file mode 100755 index 00000000..61fbfffe --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_English.py @@ -0,0 +1,65 @@ +import re + +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import AzureModel +from llmebench.tasks import CheckworthinessTask + + +def metadata(): + return { + "author": "Arabic Language Technologies, QCRI, HBKU", + "model": "LLama3-8b", + "description": "Deployed on Azure.", + "scores": {"F1 (POS)": "0.560"}, + } + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "task": CheckworthinessTask, + "model": AzureModel, + "model_args": { + "class_labels": ["0", "1"], + "max_tries": 30, + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + base_prompt = 'Classify the tweet as "checkworthy" or "not checkworthy". Provide the classification only for the last tweet, without providing any additional justification:\n' + return [ + { + "role": "user", + "content": base_prompt + input_sample, + }, + ] + + +import random + + +def post_process(response): + print(response) + if "output" in response: + # if "content" in response["messages"]: + label = response["output"].strip() + label = label.replace("", "") + label = label.replace("", "") + else: + print("Response .. " + str(response)) + label = "" + label = label.lower() + if "لا أستطيع" in label or "I cannot" in label: + return random.choice(["0", "1"]) + if ( + "لا" in label + or "not" in label + or "no" in label + or "ليس" in label + or "ليست" in label + ): + return "0" + return "1" + return label