diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Arabic.py
new file mode 100755
index 00000000..4c1272eb
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Arabic.py
@@ -0,0 +1,92 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'. 3 samples were chosen per test sample based on MaxMarginalRelevance for few-shot learning.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
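+# Benchmark configuration consumed by the llmebench framework: dataset, task, model wrapper, and data splits.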
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
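+# Build an Arabic few-shot prompt: the base instruction, numbered labeled examples (labels "نعم"/"لا"), then the test tweet with an empty label slot.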
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "اليك بعض الامثلة:\n\n"
+ for index, example in enumerate(examples):
+ label = "لا" if example["label"] == "0" else "نعم"
+
+ out_prompt = (
+ out_prompt
+ + "مثال "
+ + str(index)
+ + ":"
+ + "\n"
+ + "التغريدة: "
+ + example["input"]
+            + "\nالتصنيف: "
+ + label
+ + "\n\n"
+ )
+
+    # Append the tweet we want the model to classify, leaving the label blank
+    out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = f'هل تحتوي هذه "التغريدة" على ادعاء يستحق التحقق منه؟ أجب بـ نعم أو لا. قدم التصنيف فقط دون شرح.'
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في تحليل وتصنيف التغريدات.",
+ },
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
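+# Map the model's free-form answer onto the dataset's binary labels: "1" for checkworthy, "0" for not checkworthy, None if the answer cannot be parsed.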
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip().lower()
+
+ if (
+ "لا_يستحق_التحقق" in label
+ or "لا يستحق التحقق" in label
+ or "ليس يستحق التحقق" in label
+ or "لا تستحق التحقق" in label
+ or "no" in label
+ or "لا" in label
+ or "not" in label
+ ):
+ return "0"
+ elif (
+ "yes" in label
+ or "نعم" in label
+ or "يستحق التحقق" in label
+ or "checkworthy" in label
+ ):
+ return "1"
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_English.py
new file mode 100755
index 00000000..725b9ead
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_English.py
@@ -0,0 +1,92 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'. 3 samples were chosen per test sample based on MaxMarginalRelevance for few-shot learning.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "Here are some examples:\n\n"
+ for index, example in enumerate(examples):
+ label = "no" if example["label"] == "0" else "yes"
+
+ out_prompt = (
+ out_prompt
+ + "Example "
+ + str(index)
+ + ":"
+ + "\n"
+ + "tweet: "
+ + example["input"]
+ + "\nlabel: "
+ + label
+ + "\n\n"
+ )
+
+    # Append the tweet we want the model to classify, leaving the label blank
+ out_prompt = out_prompt + "tweet: " + input_sample + "\nlabel: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+    base_prompt = 'Annotate the "tweet" into "one" of the following categories: checkworthy or not_checkworthy. Provide only the label.'
+ return [
+ {
+ "role": "system",
+ "content": "You can analyze and classify tweets.",
+ },
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip().lower()
+
+ if (
+ "لا_يستحق_التحقق" in label
+ or "لا يستحق التحقق" in label
+ or "ليس يستحق التحقق" in label
+ or "لا تستحق التحقق" in label
+ or "no" in label
+ or "لا" in label
+ or "not" in label
+ ):
+ return "0"
+ elif (
+ "yes" in label
+ or "نعم" in label
+ or "يستحق التحقق" in label
+ or "checkworthy" in label
+ ):
+ return "1"
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Mixed.py
new file mode 100755
index 00000000..ed5a651c
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Mixed.py
@@ -0,0 +1,91 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'. 3 samples were chosen per test sample based on MaxMarginalRelevance for few-shot learning.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "اليك بعض الامثلة:\n\n"
+ for index, example in enumerate(examples):
+ label = "not_checkworthy" if example["label"] == "0" else "checkworthy"
+
+ out_prompt = (
+ out_prompt
+ + "مثال "
+ + str(index)
+ + ":"
+ + "\n"
+ + "التغريدة: "
+ + example["input"]
+            + "\nالتصنيف: "
+ + label
+ + "\n\n"
+ )
+
+    # Append the tweet we want the model to classify, leaving the label blank
+    out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = 'هل تحتوي هذه "التغريدة" على ادعاء يستحق التحقق منه؟ أجب بـ checkworthy أو not_checkworthy'
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في تحليل وتصنيف التغريدات.",
+ },
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+ label = label.replace("label:", "").strip().lower()
+
+ if (
+ "لا_يستحق_التحقق" in label
+ or "لا يستحق التحقق" in label
+ or "ليس يستحق التحقق" in label
+ or "لا تستحق التحقق" in label
+ or "no" in label
+ or "لا" in label
+ or "not" in label
+ ):
+ return "0"
+ elif (
+ "yes" in label
+ or "نعم" in label
+ or "يستحق التحقق" in label
+ or "checkworthy" in label
+ ):
+ return "1"
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Arabic.py
new file mode 100755
index 00000000..8b7cc4e4
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Arabic.py
@@ -0,0 +1,71 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "gpt-4-32k (version 0314)",
+ "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ prompt_string = (
+ f'هل تحتوي هذه "التغريدة" على ادعاء يستحق التحقق منه؟ أجب بـ نعم أو لا. قدم التصنيف فقط دون شرح.\n\n'
+ f"التغريدة: {input_sample}\n"
+ f"التصنيف: \n"
+ )
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في تحليل وتصنيف التغريدات.",
+ },
+ {
+ "role": "user",
+ "content": prompt_string,
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip().lower()
+
+ if (
+ "لا_يستحق_التحقق" in label
+ or "لا يستحق التحقق" in label
+ or "ليس يستحق التحقق" in label
+ or "لا تستحق التحقق" in label
+ or "no" in label
+ or "لا" in label
+ or "not" in label
+ ):
+ return "0"
+ elif (
+ "yes" in label
+ or "نعم" in label
+ or "يستحق التحقق" in label
+ or "checkworthy" in label
+ ):
+ return "1"
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_English.py
new file mode 100755
index 00000000..435da81f
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_English.py
@@ -0,0 +1,71 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "gpt-4-32k (version 0314)",
+ "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ prompt_string = (
+        f'Does the following "tweet" contain a factual claim that is worth fact-checking? Return checkworthy if it does, or not_checkworthy otherwise. Return only the label.\n\n'
+ f"tweet: {input_sample}\n"
+ f"label: \n"
+ )
+ return [
+ {
+ "role": "system",
+ "content": "You can analyze and classify tweets.",
+ },
+ {
+ "role": "user",
+ "content": prompt_string,
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip().lower()
+
+ if (
+ "لا_يستحق_التحقق" in label
+ or "لا يستحق التحقق" in label
+ or "ليس يستحق التحقق" in label
+ or "لا تستحق التحقق" in label
+ or "no" in label
+ or "لا" in label
+ or "not" in label
+ ):
+ return "0"
+ elif (
+ "yes" in label
+ or "نعم" in label
+ or "يستحق التحقق" in label
+ or "checkworthy" in label
+ ):
+ return "1"
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Mixed.py
new file mode 100755
index 00000000..7117e701
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Mixed.py
@@ -0,0 +1,71 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "gpt-4-32k (version 0314)",
+ "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ prompt_string = (
+ f'هل تحتوي هذه "التغريدة" على ادعاء يستحق التحقق منه؟ أجب بـ checkworthy أو not_checkworthy\n\n'
+ f"التغريدة: {input_sample}\n"
+ f"التصنيف: \n"
+ )
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في تحليل وتصنيف التغريدات.",
+ },
+ {
+ "role": "user",
+ "content": prompt_string,
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip().lower()
+
+ if (
+ "لا_يستحق_التحقق" in label
+ or "لا يستحق التحقق" in label
+ or "ليس يستحق التحقق" in label
+ or "لا تستحق التحقق" in label
+ or "no" in label
+ or "لا" in label
+ or "not" in label
+ ):
+ return "0"
+ elif (
+ "yes" in label
+ or "نعم" in label
+ or "يستحق التحقق" in label
+ or "checkworthy" in label
+ ):
+ return "1"
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Arabic.py
new file mode 100755
index 00000000..a6c16047
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Arabic.py
@@ -0,0 +1,79 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
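+# Build an Arabic few-shot prompt using the labels "جديرة بالتحقق" / "غير جديرة بالتحقق", ending with the unlabeled test tweet.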
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "هذه بعض الأمثلة:\n\n"
+ for index, example in enumerate(examples):
+ label = "غير جديرة بالتحقق" if example["label"] == "0" else "جديرة بالتحقق"
+
+ out_prompt = (
+ out_prompt
+ + "مثال "
+ + str(index)
+ + ":"
+ + "\n"
+ + "التغريدة: "
+ + example["input"]
+            + "\nالتصنيف: "
+ + label
+ + "\n\n"
+ )
+
+    out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = ' صنف التغريدة الى "جديرة بالتحقق" أو "غير جديرة بالتحقق". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
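+# Heuristic mapping: any negation keyword in the answer maps to "0" (not checkworthy); everything else defaults to "1".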
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip()
+
+ label = label.lower()
+ if (
+ "لا" in label
+ or "غير" in label
+ or "no" in label
+ or "ليس" in label
+ or "ليست" in label
+ ):
+ return "0"
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_English.py
new file mode 100755
index 00000000..3eccba0a
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_English.py
@@ -0,0 +1,79 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 100,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "These are some examples:\n\n"
+ for index, example in enumerate(examples):
+ label = "not checkworthy" if example["label"] == "0" else "checkworthy"
+
+ out_prompt = (
+ out_prompt
+ + "Example "
+ + str(index)
+ + ":\n"
+ + "Tweet: "
+ + example["input"]
+ + "\n"
+ + "Classification: "
+ + label
+ + "\n\n"
+ )
+
+ out_prompt = out_prompt + "Tweet: " + input_sample + "\nClassification: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = 'Classify the tweet as "checkworthy" or "not checkworthy". Provide the classification only for the last tweet, without providing any additional justification:\n'
+ return [
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip()
+
+ label = label.lower()
+ if (
+ "لا" in label
+ or "not" in label
+ or "no" in label
+ or "ليس" in label
+ or "ليست" in label
+ ):
+ return "0"
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Mixed.py
new file mode 100755
index 00000000..84e8f85e
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Mixed.py
@@ -0,0 +1,79 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "هذه بعض الأمثلة:\n\n"
+ for index, example in enumerate(examples):
+ label = "not checkworthy" if example["label"] == "0" else "checkworthy"
+
+ out_prompt = (
+ out_prompt
+ + "مثال "
+ + str(index)
+ + ":"
+ + "\n"
+ + "التغريدة: "
+ + example["input"]
+ + "\nالتصنيف: "
+ + label
+ + "\n\n"
+ )
+
+ out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = ' صنف التغريدة الى "checkworthy" أو "not checkworthy". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip()
+
+ label = label.lower()
+ if (
+ "لا" in label
+ or "غير" in label
+ or "no" in label
+ or "ليس" in label
+ or "ليست" in label
+ ):
+ return "0"
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Arabic.py
new file mode 100755
index 00000000..99cba377
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Arabic.py
@@ -0,0 +1,54 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ base_prompt = ' صنف التغريدة الى "جديرة بالتحقق" أو "غير جديرة بالتحقق". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": base_prompt + input_sample,
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip()
+
+ label = label.lower()
+ if (
+ "لا" in label
+ or "غير" in label
+ or "no" in label
+ or "ليس" in label
+ or "ليست" in label
+ ):
+ return "0"
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_English.py
new file mode 100755
index 00000000..9b45ee04
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_English.py
@@ -0,0 +1,63 @@
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "Jais-13b-chat",
+ "description": "Locally hosted Jais-13b-chat model using FastChat.",
+ "scores": {
+ "F1 (POS)": "0.000",
+ "Acc": "0.638",
+ "comment": "jais only predicts negative class",
+ },
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ base_prompt = (
+ f'Classify the tweet as "checkworthy" or "not checkworthy". Provide the classification only for the last tweet, without providing any additional justification:\n\n'
+ f"tweet: {input_sample}\n"
+ f"label: \n"
+ )
+ return [
+ {
+ "role": "user",
+ "content": base_prompt,
+ },
+ ]
+
+
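+# Map the model's answer to the dataset labels: "not checkworthy"/"not_checkworthy" -> "0", "checkworthy" -> "1", anything else -> None.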
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip()
+
+ label = label.lower()
+    if "not checkworthy" in label or "not_checkworthy" in label:
+        label_fixed = "0"
+    elif "checkworthy" in label:
+        label_fixed = "1"
+    else:
+        label_fixed = None
+
+ return label_fixed
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Mixed.py
new file mode 100755
index 00000000..79a64328
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Mixed.py
@@ -0,0 +1,55 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ base_prompt = ' صنف التغريدة الى "checkworthy" أو "not checkworthy". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": base_prompt + input_sample,
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip()
+
+ label = label.lower()
+ if (
+ "لا" in label
+ or "غير" in label
+ or "no" in label
+ or "ليس" in label
+ or "ليست" in label
+ ):
+ return "0"
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Arabic.py
new file mode 100755
index 00000000..7b07189e
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Arabic.py
@@ -0,0 +1,91 @@
+import random
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "LLama3-8b",
+ "description": "Deployed on Azure.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
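+# Build an Arabic few-shot prompt with the labels "جديرة بالتحقق" / "غير جديرة بالتحقق", ending with the unlabeled test tweet.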
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "هذه بعض الأمثلة:\n\n"
+ for index, example in enumerate(examples):
+ label = "غير جديرة بالتحقق" if example["label"] == "0" else "جديرة بالتحقق"
+
+ out_prompt = (
+ out_prompt
+ + "مثال "
+ + str(index)
+ + ":"
+ + "\n"
+ + "التغريدة: "
+ + example["input"]
+            + "\nالتصنيف: "
+ + label
+ + "\n\n"
+ )
+
+    out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = ' صنف التغريدة الى "جديرة بالتحقق" أو "غير جديرة بالتحقق". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
+def post_process(response):
+    # The Azure deployment returns the generated text under the "output" key.
+    if "output" in response:
+        label = response["output"].strip()
+    else:
+        print("Response .. " + str(response))
+        label = ""
+
+    label = label.lower()
+
+    # If the model refuses to answer, fall back to a random label.
+    if "لا أستطيع" in label:
+        return random.choice(["0", "1"])
+    if (
+        "لا" in label
+        or "غير" in label
+        or "no" in label
+        or "ليس" in label
+        or "ليست" in label
+    ):
+        return "0"
+    return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_English.py
new file mode 100755
index 00000000..45651e3e
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_English.py
@@ -0,0 +1,90 @@
+import random
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "LLama3-8b",
+ "description": "Deployed on Azure.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 100,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "These are some examples:\n\n"
+ for index, example in enumerate(examples):
+ label = "not checkworthy" if example["label"] == "0" else "checkworthy"
+
+ out_prompt = (
+ out_prompt
+ + "Example "
+ + str(index)
+ + ":\n"
+ + "Tweet: "
+ + example["input"]
+ + "\n"
+ + "Classification: "
+ + label
+ + "\n\n"
+ )
+
+ out_prompt = out_prompt + "Tweet: " + input_sample + "\nClassification: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = 'Classify the tweet as "checkworthy" or "not checkworthy". Provide the classification only for the last tweet, without providing any additional justification:\n'
+ return [
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
+def post_process(response):
+    if "output" in response:
+        label = response["output"].strip()
+    else:
+        print("Response .. " + str(response))
+        label = ""
+
+    label = label.lower()
+
+    # If the model refuses to answer, fall back to a random label.
+    if "لا أستطيع" in label or "i cannot" in label:
+        return random.choice(["0", "1"])
+    if (
+        "not" in label
+        or "غير" in label
+        or "no" in label
+        or "ليس" in label
+        or "ليست" in label
+    ):
+        return "0"
+    return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Mixed.py
new file mode 100755
index 00000000..1aa9810a
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Mixed.py
@@ -0,0 +1,90 @@
+import random
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "LLama3-8b",
+ "description": "Deployed on Azure.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "هذه بعض الأمثلة:\n\n"
+ for index, example in enumerate(examples):
+ label = "not checkworthy" if example["label"] == "0" else "checkworthy"
+
+ out_prompt = (
+ out_prompt
+ + "مثال "
+ + str(index)
+ + ":"
+ + "\n"
+ + "التغريدة: "
+ + example["input"]
+ + "\nالتصنيف: "
+ + label
+ + "\n\n"
+ )
+
+ out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = ' صنف التغريدة الى "checkworthy" أو "not checkworthy". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
+def post_process(response):
+    if "output" in response:
+        label = response["output"].strip()
+    else:
+        print("Response .. " + str(response))
+        label = ""
+
+    label = label.lower()
+
+    # If the model refuses to answer, fall back to a random label.
+    if "لا أستطيع" in label or "i cannot" in label:
+        return random.choice(["0", "1"])
+    if (
+        "لا" in label
+        or "غير" in label
+        or "no" in label
+        or "ليس" in label
+        or "ليست" in label
+        or "not" in label
+    ):
+        return "0"
+    return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_ZeroShot_Mixed.py
new file mode 100755
index 00000000..c2aada9f
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_ZeroShot_Mixed.py
@@ -0,0 +1,66 @@
+import random
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "LLama3-8b",
+ "description": "Deployed on Azure.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
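+# Zero-shot "mixed" setup: Arabic instruction with English class names (checkworthy / not checkworthy).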
+def prompt(input_sample):
+ base_prompt = ' صنف التغريدة الى "checkworthy" أو "not checkworthy". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": base_prompt + input_sample,
+ },
+ ]
+
+
+def post_process(response):
+    if "output" in response:
+        label = response["output"].strip()
+    else:
+        print("Response .. " + str(response))
+        label = ""
+
+    label = label.lower()
+
+    # If the model refuses to answer, fall back to a random label.
+    if "لا أستطيع" in label or "i cannot" in label:
+        return random.choice(["0", "1"])
+    if (
+        "لا" in label
+        or "غير" in label
+        or "no" in label
+        or "ليس" in label
+        or "ليست" in label
+    ):
+        return "0"
+    return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_Arabic.py
new file mode 100755
index 00000000..5dc4b3c5
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_Arabic.py
@@ -0,0 +1,65 @@
+import random
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "LLama3-8b",
+ "description": "Deployed on Azure.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ base_prompt = ' صنف التغريدة الى "جديرة بالتحقق" أو "غير جديرة بالتحقق". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": base_prompt + input_sample,
+ },
+ ]
+
+
+def post_process(response):
+    if "output" in response:
+        label = response["output"].strip()
+    else:
+        print("Response .. " + str(response))
+        label = ""
+
+    label = label.lower()
+
+    # If the model refuses to answer, fall back to a random label.
+    if "لا أستطيع" in label:
+        return random.choice(["0", "1"])
+    if (
+        "لا" in label
+        or "غير" in label
+        or "no" in label
+        or "ليس" in label
+        or "ليست" in label
+    ):
+        return "0"
+    return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_English.py
new file mode 100755
index 00000000..61fbfffe
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_English.py
@@ -0,0 +1,65 @@
+import random
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "LLama3-8b",
+ "description": "Deployed on Azure.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ base_prompt = 'Classify the tweet as "checkworthy" or "not checkworthy". Provide the classification only for the last tweet, without providing any additional justification:\n'
+ return [
+ {
+ "role": "user",
+ "content": base_prompt + input_sample,
+ },
+ ]
+
+
+def post_process(response):
+    if "output" in response:
+        label = response["output"].strip()
+    else:
+        print("Response .. " + str(response))
+        label = ""
+
+    label = label.lower()
+
+    # If the model refuses to answer, fall back to a random label.
+    if "لا أستطيع" in label or "i cannot" in label:
+        return random.choice(["0", "1"])
+    if (
+        "لا" in label
+        or "not" in label
+        or "no" in label
+        or "ليس" in label
+        or "ليست" in label
+    ):
+        return "0"
+    return "1"