diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Arabic.py
new file mode 100755
index 00000000..4c1272eb
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Arabic.py
@@ -0,0 +1,92 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'. 3 samples were chosen per test sample based on MaxMarginalRelevance for few-shot learning.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
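+# Benchmark configuration consumed by the llmebench framework: dataset, task, model wrapper, and data splits.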
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
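+# Build an Arabic few-shot prompt: the base instruction, numbered labeled examples (labels "نعم"/"لا"), then the test tweet with an empty label slot.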
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "اليك بعض الامثلة:\n\n"
+ for index, example in enumerate(examples):
+ label = "لا" if example["label"] == "0" else "نعم"
+
+ out_prompt = (
+ out_prompt
+ + "مثال "
+ + str(index)
+ + ":"
+ + "\n"
+ + "التغريدة: "
+ + example["input"]
+            + "\nالتصنيف: "
+ + label
+ + "\n\n"
+ )
+
+    # Append the tweet we want the model to classify, leaving the label blank
+    out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = f'هل تحتوي هذه "التغريدة" على ادعاء يستحق التحقق منه؟ أجب بـ نعم أو لا. قدم التصنيف فقط دون شرح.'
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في تحليل وتصنيف التغريدات.",
+ },
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
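+# Map the model's free-form answer onto the dataset's binary labels: "1" for checkworthy, "0" for not checkworthy, None if the answer cannot be parsed.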
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip().lower()
+
+ if (
+ "لا_يستحق_التحقق" in label
+ or "لا يستحق التحقق" in label
+ or "ليس يستحق التحقق" in label
+ or "لا تستحق التحقق" in label
+ or "no" in label
+ or "لا" in label
+ or "not" in label
+ ):
+ return "0"
+ elif (
+ "yes" in label
+ or "نعم" in label
+ or "يستحق التحقق" in label
+ or "checkworthy" in label
+ ):
+ return "1"
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_English.py
new file mode 100755
index 00000000..725b9ead
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_English.py
@@ -0,0 +1,92 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'. 3 samples were chosen per test sample based on MaxMarginalRelevance for few-shot learning.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "Here are some examples:\n\n"
+ for index, example in enumerate(examples):
+ label = "no" if example["label"] == "0" else "yes"
+
+ out_prompt = (
+ out_prompt
+ + "Example "
+ + str(index)
+ + ":"
+ + "\n"
+ + "tweet: "
+ + example["input"]
+ + "\nlabel: "
+ + label
+ + "\n\n"
+ )
+
+    # Append the tweet we want the model to classify, leaving the label blank
+ out_prompt = out_prompt + "tweet: " + input_sample + "\nlabel: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+    base_prompt = 'Annotate the "tweet" into "one" of the following categories: checkworthy or not_checkworthy. Provide only the label.'
+ return [
+ {
+ "role": "system",
+ "content": "You can analyze and classify tweets.",
+ },
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip().lower()
+
+ if (
+ "لا_يستحق_التحقق" in label
+ or "لا يستحق التحقق" in label
+ or "ليس يستحق التحقق" in label
+ or "لا تستحق التحقق" in label
+ or "no" in label
+ or "لا" in label
+ or "not" in label
+ ):
+ return "0"
+ elif (
+ "yes" in label
+ or "نعم" in label
+ or "يستحق التحقق" in label
+ or "checkworthy" in label
+ ):
+ return "1"
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Mixed.py
new file mode 100755
index 00000000..ed5a651c
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_FewShot_Mixed.py
@@ -0,0 +1,91 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'. 3 samples were chosen per test sample based on MaxMarginalRelevance for few-shot learning.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "اليك بعض الامثلة:\n\n"
+ for index, example in enumerate(examples):
+ label = "not_checkworthy" if example["label"] == "0" else "checkworthy"
+
+ out_prompt = (
+ out_prompt
+ + "مثال "
+ + str(index)
+ + ":"
+ + "\n"
+ + "التغريدة: "
+ + example["input"]
+            + "\nالتصنيف: "
+ + label
+ + "\n\n"
+ )
+
+    # Append the tweet we want the model to classify, leaving the label blank
+    out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = 'هل تحتوي هذه "التغريدة" على ادعاء يستحق التحقق منه؟ أجب بـ checkworthy أو not_checkworthy'
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في تحليل وتصنيف التغريدات.",
+ },
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+ label = label.replace("label:", "").strip().lower()
+
+ if (
+ "لا_يستحق_التحقق" in label
+ or "لا يستحق التحقق" in label
+ or "ليس يستحق التحقق" in label
+ or "لا تستحق التحقق" in label
+ or "no" in label
+ or "لا" in label
+ or "not" in label
+ ):
+ return "0"
+ elif (
+ "yes" in label
+ or "نعم" in label
+ or "يستحق التحقق" in label
+ or "checkworthy" in label
+ ):
+ return "1"
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Arabic.py
new file mode 100755
index 00000000..8b7cc4e4
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Arabic.py
@@ -0,0 +1,71 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "gpt-4-32k (version 0314)",
+ "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ prompt_string = (
+ f'هل تحتوي هذه "التغريدة" على ادعاء يستحق التحقق منه؟ أجب بـ نعم أو لا. قدم التصنيف فقط دون شرح.\n\n'
+ f"التغريدة: {input_sample}\n"
+ f"التصنيف: \n"
+ )
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في تحليل وتصنيف التغريدات.",
+ },
+ {
+ "role": "user",
+ "content": prompt_string,
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip().lower()
+
+ if (
+ "لا_يستحق_التحقق" in label
+ or "لا يستحق التحقق" in label
+ or "ليس يستحق التحقق" in label
+ or "لا تستحق التحقق" in label
+ or "no" in label
+ or "لا" in label
+ or "not" in label
+ ):
+ return "0"
+ elif (
+ "yes" in label
+ or "نعم" in label
+ or "يستحق التحقق" in label
+ or "checkworthy" in label
+ ):
+ return "1"
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_English.py
new file mode 100755
index 00000000..435da81f
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_English.py
@@ -0,0 +1,71 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "gpt-4-32k (version 0314)",
+ "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ prompt_string = (
+        f'Does the following "tweet" contain a factual claim that is worth fact-checking? Return checkworthy if it does, or not_checkworthy otherwise. Return only the label.\n\n'
+ f"tweet: {input_sample}\n"
+ f"label: \n"
+ )
+ return [
+ {
+ "role": "system",
+ "content": "You can analyze and classify tweets.",
+ },
+ {
+ "role": "user",
+ "content": prompt_string,
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip().lower()
+
+ if (
+ "لا_يستحق_التحقق" in label
+ or "لا يستحق التحقق" in label
+ or "ليس يستحق التحقق" in label
+ or "لا تستحق التحقق" in label
+ or "no" in label
+ or "لا" in label
+ or "not" in label
+ ):
+ return "0"
+ elif (
+ "yes" in label
+ or "نعم" in label
+ or "يستحق التحقق" in label
+ or "checkworthy" in label
+ ):
+ return "1"
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Mixed.py
new file mode 100755
index 00000000..7117e701
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_GPT4_ZeroShot_Mixed.py
@@ -0,0 +1,71 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "gpt-4-32k (version 0314)",
+ "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": OpenAIModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ prompt_string = (
+ f'هل تحتوي هذه "التغريدة" على ادعاء يستحق التحقق منه؟ أجب بـ checkworthy أو not_checkworthy\n\n'
+ f"التغريدة: {input_sample}\n"
+ f"التصنيف: \n"
+ )
+ return [
+ {
+ "role": "system",
+ "content": "أنت خبير في تحليل وتصنيف التغريدات.",
+ },
+ {
+ "role": "user",
+ "content": prompt_string,
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip().lower()
+
+ if (
+ "لا_يستحق_التحقق" in label
+ or "لا يستحق التحقق" in label
+ or "ليس يستحق التحقق" in label
+ or "لا تستحق التحقق" in label
+ or "no" in label
+ or "لا" in label
+ or "not" in label
+ ):
+ return "0"
+ elif (
+ "yes" in label
+ or "نعم" in label
+ or "يستحق التحقق" in label
+ or "checkworthy" in label
+ ):
+ return "1"
+ else:
+ return None
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Arabic.py
new file mode 100755
index 00000000..a6c16047
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Arabic.py
@@ -0,0 +1,79 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
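+# Build an Arabic few-shot prompt using the labels "جديرة بالتحقق" / "غير جديرة بالتحقق", ending with the unlabeled test tweet.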
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "هذه بعض الأمثلة:\n\n"
+ for index, example in enumerate(examples):
+ label = "غير جديرة بالتحقق" if example["label"] == "0" else "جديرة بالتحقق"
+
+ out_prompt = (
+ out_prompt
+ + "مثال "
+ + str(index)
+ + ":"
+ + "\n"
+ + "التغريدة: "
+ + example["input"]
+            + "\nالتصنيف: "
+ + label
+ + "\n\n"
+ )
+
+    out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = ' صنف التغريدة الى "جديرة بالتحقق" أو "غير جديرة بالتحقق". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
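+# Heuristic mapping: any negation keyword in the answer maps to "0" (not checkworthy); everything else defaults to "1".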
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip()
+
+ label = label.lower()
+ if (
+ "لا" in label
+ or "غير" in label
+ or "no" in label
+ or "ليس" in label
+ or "ليست" in label
+ ):
+ return "0"
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_English.py
new file mode 100755
index 00000000..3eccba0a
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_English.py
@@ -0,0 +1,79 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 100,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "These are some examples:\n\n"
+ for index, example in enumerate(examples):
+ label = "not checkworthy" if example["label"] == "0" else "checkworthy"
+
+ out_prompt = (
+ out_prompt
+ + "Example "
+ + str(index)
+ + ":\n"
+ + "Tweet: "
+ + example["input"]
+ + "\n"
+ + "Classification: "
+ + label
+ + "\n\n"
+ )
+
+ out_prompt = out_prompt + "Tweet: " + input_sample + "\nClassification: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = 'Classify the tweet as "checkworthy" or "not checkworthy". Provide the classification only for the last tweet, without providing any additional justification:\n'
+ return [
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip()
+
+ label = label.lower()
+ if (
+ "لا" in label
+ or "not" in label
+ or "no" in label
+ or "ليس" in label
+ or "ليست" in label
+ ):
+ return "0"
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Mixed.py
new file mode 100755
index 00000000..84e8f85e
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_FewShot_Mixed.py
@@ -0,0 +1,79 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "هذه بعض الأمثلة:\n\n"
+ for index, example in enumerate(examples):
+ label = "not checkworthy" if example["label"] == "0" else "checkworthy"
+
+ out_prompt = (
+ out_prompt
+ + "مثال "
+ + str(index)
+ + ":"
+ + "\n"
+ + "التغريدة: "
+ + example["input"]
+ + "\nالتصنيف: "
+ + label
+ + "\n\n"
+ )
+
+ out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = ' صنف التغريدة الى "checkworthy" أو "not checkworthy". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip()
+
+ label = label.lower()
+ if (
+ "لا" in label
+ or "غير" in label
+ or "no" in label
+ or "ليس" in label
+ or "ليست" in label
+ ):
+ return "0"
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Arabic.py
new file mode 100755
index 00000000..99cba377
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Arabic.py
@@ -0,0 +1,54 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ base_prompt = ' صنف التغريدة الى "جديرة بالتحقق" أو "غير جديرة بالتحقق". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": base_prompt + input_sample,
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip()
+
+ label = label.lower()
+ if (
+ "لا" in label
+ or "غير" in label
+ or "no" in label
+ or "ليس" in label
+ or "ليست" in label
+ ):
+ return "0"
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_English.py
new file mode 100755
index 00000000..9b45ee04
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_English.py
@@ -0,0 +1,63 @@
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "Jais-13b-chat",
+ "description": "Locally hosted Jais-13b-chat model using FastChat.",
+ "scores": {
+ "F1 (POS)": "0.000",
+ "Acc": "0.638",
+ "comment": "jais only predicts negative class",
+ },
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ base_prompt = (
+ f'Classify the tweet as "checkworthy" or "not checkworthy". Provide the classification only for the last tweet, without providing any additional justification:\n\n'
+ f"tweet: {input_sample}\n"
+ f"label: \n"
+ )
+ return [
+ {
+ "role": "user",
+ "content": base_prompt,
+ },
+ ]
+
+
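+# Map the model's answer to the dataset labels: "not checkworthy"/"not_checkworthy" -> "0", "checkworthy" -> "1", anything else -> None.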
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip()
+
+ label = label.lower()
+    if "not checkworthy" in label or "not_checkworthy" in label:
+        label_fixed = "0"
+    elif "checkworthy" in label:
+        label_fixed = "1"
+    else:
+        label_fixed = None
+
+ return label_fixed
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Mixed.py
new file mode 100755
index 00000000..79a64328
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_JAIS13b_ZeroShot_Mixed.py
@@ -0,0 +1,55 @@
+import re
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": FastChatModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ base_prompt = ' صنف التغريدة الى "checkworthy" أو "not checkworthy". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": base_prompt + input_sample,
+ },
+ ]
+
+
+def post_process(response):
+ label = response["choices"][0]["message"]["content"]
+
+ label = label.replace("label:", "").strip()
+
+ label = label.lower()
+ if (
+ "لا" in label
+ or "غير" in label
+ or "no" in label
+ or "ليس" in label
+ or "ليست" in label
+ ):
+ return "0"
+ return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Arabic.py
new file mode 100755
index 00000000..7b07189e
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Arabic.py
@@ -0,0 +1,91 @@
+import random
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "LLama3-8b",
+ "description": "Deployed on Azure.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
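+# Build an Arabic few-shot prompt with the labels "جديرة بالتحقق" / "غير جديرة بالتحقق", ending with the unlabeled test tweet.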
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "هذه بعض الأمثلة:\n\n"
+ for index, example in enumerate(examples):
+ label = "غير جديرة بالتحقق" if example["label"] == "0" else "جديرة بالتحقق"
+
+ out_prompt = (
+ out_prompt
+ + "مثال "
+ + str(index)
+ + ":"
+ + "\n"
+ + "التغريدة: "
+ + example["input"]
+            + "\nالتصنيف: "
+ + label
+ + "\n\n"
+ )
+
+    out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = ' صنف التغريدة الى "جديرة بالتحقق" أو "غير جديرة بالتحقق". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
+def post_process(response):
+    # The Azure deployment returns the generated text under the "output" key.
+    if "output" in response:
+        label = response["output"].strip()
+    else:
+        print("Response .. " + str(response))
+        label = ""
+
+    label = label.lower()
+
+    # If the model refuses to answer, fall back to a random label.
+    if "لا أستطيع" in label:
+        return random.choice(["0", "1"])
+    if (
+        "لا" in label
+        or "غير" in label
+        or "no" in label
+        or "ليس" in label
+        or "ليست" in label
+    ):
+        return "0"
+    return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_English.py
new file mode 100755
index 00000000..45651e3e
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_English.py
@@ -0,0 +1,90 @@
+import random
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "LLama3-8b",
+ "description": "Deployed on Azure.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 100,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "These are some examples:\n\n"
+ for index, example in enumerate(examples):
+ label = "not checkworthy" if example["label"] == "0" else "checkworthy"
+
+ out_prompt = (
+ out_prompt
+ + "Example "
+ + str(index)
+ + ":\n"
+ + "Tweet: "
+ + example["input"]
+ + "\n"
+ + "Classification: "
+ + label
+ + "\n\n"
+ )
+
+ out_prompt = out_prompt + "Tweet: " + input_sample + "\nClassification: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = 'Classify the tweet as "checkworthy" or "not checkworthy". Provide the classification only for the last tweet, without providing any additional justification:\n'
+ return [
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
+def post_process(response):
+    if "output" in response:
+        label = response["output"].strip()
+    else:
+        print("Response .. " + str(response))
+        label = ""
+
+    label = label.lower()
+
+    # If the model refuses to answer, fall back to a random label.
+    if "لا أستطيع" in label or "i cannot" in label:
+        return random.choice(["0", "1"])
+    if (
+        "not" in label
+        or "غير" in label
+        or "no" in label
+        or "ليس" in label
+        or "ليست" in label
+    ):
+        return "0"
+    return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Mixed.py
new file mode 100755
index 00000000..1aa9810a
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_FewShot_Mixed.py
@@ -0,0 +1,90 @@
+import random
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "LLama3-8b",
+ "description": "Deployed on Azure.",
+ "scores": {"F1 (POS)": "0.554"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar", "fewshot": {"train_split": "ar"}},
+ }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+ out_prompt = base_prompt + "\n"
+ out_prompt = out_prompt + "هذه بعض الأمثلة:\n\n"
+ for index, example in enumerate(examples):
+ label = "not checkworthy" if example["label"] == "0" else "checkworthy"
+
+ out_prompt = (
+ out_prompt
+ + "مثال "
+ + str(index)
+ + ":"
+ + "\n"
+ + "التغريدة: "
+ + example["input"]
+ + "\nالتصنيف: "
+ + label
+ + "\n\n"
+ )
+
+ out_prompt = out_prompt + "التغريدة: " + input_sample + "\nالتصنيف: \n"
+
+ return out_prompt
+
+
+def prompt(input_sample, examples):
+ base_prompt = ' صنف التغريدة الى "checkworthy" أو "not checkworthy". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": few_shot_prompt(input_sample, base_prompt, examples),
+ },
+ ]
+
+
+def post_process(response):
+    if "output" in response:
+        label = response["output"].strip()
+    else:
+        print("Response .. " + str(response))
+        label = ""
+
+    label = label.lower()
+
+    # If the model refuses to answer, fall back to a random label.
+    if "لا أستطيع" in label or "i cannot" in label:
+        return random.choice(["0", "1"])
+    if (
+        "لا" in label
+        or "غير" in label
+        or "no" in label
+        or "ليس" in label
+        or "ليست" in label
+        or "not" in label
+    ):
+        return "0"
+    return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_ZeroShot_Mixed.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_ZeroShot_Mixed.py
new file mode 100755
index 00000000..c2aada9f
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_ZeroShot_Mixed.py
@@ -0,0 +1,66 @@
+import random
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "LLama3-8b",
+ "description": "Deployed on Azure.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
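+# Zero-shot "mixed" setup: Arabic instruction with English class names (checkworthy / not checkworthy).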
+def prompt(input_sample):
+ base_prompt = ' صنف التغريدة الى "checkworthy" أو "not checkworthy". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": base_prompt + input_sample,
+ },
+ ]
+
+
+def post_process(response):
+    if "output" in response:
+        label = response["output"].strip()
+    else:
+        print("Response .. " + str(response))
+        label = ""
+
+    label = label.lower()
+
+    # If the model refuses to answer, fall back to a random label.
+    if "لا أستطيع" in label or "i cannot" in label:
+        return random.choice(["0", "1"])
+    if (
+        "لا" in label
+        or "غير" in label
+        or "no" in label
+        or "ليس" in label
+        or "ليست" in label
+    ):
+        return "0"
+    return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_Arabic.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_Arabic.py
new file mode 100755
index 00000000..5dc4b3c5
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_Arabic.py
@@ -0,0 +1,65 @@
+import random
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "LLama3-8b",
+ "description": "Deployed on Azure.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ base_prompt = ' صنف التغريدة الى "جديرة بالتحقق" أو "غير جديرة بالتحقق". قم بتقديم التصنيف فقط لأخر تغريدة، لا تقدم أي تبرير إضافي.:\n'
+ return [
+ {
+ "role": "user",
+ "content": base_prompt + input_sample,
+ },
+ ]
+
+
+def post_process(response):
+    if "output" in response:
+        label = response["output"].strip()
+    else:
+        print("Response .. " + str(response))
+        label = ""
+
+    label = label.lower()
+
+    # If the model refuses to answer, fall back to a random label.
+    if "لا أستطيع" in label:
+        return random.choice(["0", "1"])
+    if (
+        "لا" in label
+        or "غير" in label
+        or "no" in label
+        or "ليس" in label
+        or "ليست" in label
+    ):
+        return "0"
+    return "1"
diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_English.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_English.py
new file mode 100755
index 00000000..61fbfffe
--- /dev/null
+++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Llama3-8b_Zeroshot_English.py
@@ -0,0 +1,65 @@
+import random
+
+from llmebench.datasets import CT22CheckworthinessDataset
+from llmebench.models import AzureModel
+from llmebench.tasks import CheckworthinessTask
+
+
+def metadata():
+ return {
+ "author": "Arabic Language Technologies, QCRI, HBKU",
+ "model": "LLama3-8b",
+ "description": "Deployed on Azure.",
+ "scores": {"F1 (POS)": "0.560"},
+ }
+
+
+def config():
+ return {
+ "dataset": CT22CheckworthinessDataset,
+ "task": CheckworthinessTask,
+ "model": AzureModel,
+ "model_args": {
+ "class_labels": ["0", "1"],
+ "max_tries": 30,
+ },
+ "general_args": {"test_split": "ar"},
+ }
+
+
+def prompt(input_sample):
+ base_prompt = 'Classify the tweet as "checkworthy" or "not checkworthy". Provide the classification only for the last tweet, without providing any additional justification:\n'
+ return [
+ {
+ "role": "user",
+ "content": base_prompt + input_sample,
+ },
+ ]
+
+
+def post_process(response):
+    if "output" in response:
+        label = response["output"].strip()
+    else:
+        print("Response .. " + str(response))
+        label = ""
+
+    label = label.lower()
+
+    # If the model refuses to answer, fall back to a random label.
+    if "لا أستطيع" in label or "i cannot" in label:
+        return random.choice(["0", "1"])
+    if (
+        "لا" in label
+        or "not" in label
+        or "no" in label
+        or "ليس" in label
+        or "ليست" in label
+    ):
+        return "0"
+    return "1"