From f48adcee6d0a9a119f6991e492ecf98edfd9ba80 Mon Sep 17 00:00:00 2001
From: Arid Hasan <18038960+AridHasan@users.noreply.github.com>
Date: Wed, 31 Jan 2024 06:51:06 -0400
Subject: [PATCH] Add Jais ZeroShot assets for semantic tasks (#256)

* Jais13b added for Arabic semantics

* Format code, remove unused imports and minor metadata changes

---------

Co-authored-by: Fahim Imaduddin Dalvi <faimaduddin@hbku.edu.qa>
---
 .../ar/semantics/NLI/XNLI_JAIS13b_ZeroShot.py | 57 ++++++++++++++++++
 .../semantics/STS/Q2QSim_JAIS13b_ZeroShot.py  | 57 ++++++++++++++++++
 .../STS/SemEval17T1STS_JAIS13b_ZeroShot.py    | 58 +++++++++++++++++++
 .../STS/SemEval17T2STS_JAIS13b_ZeroShot.py    | 58 +++++++++++++++++++
 4 files changed, 230 insertions(+)
 create mode 100644 assets/ar/semantics/NLI/XNLI_JAIS13b_ZeroShot.py
 create mode 100644 assets/ar/semantics/STS/Q2QSim_JAIS13b_ZeroShot.py
 create mode 100644 assets/ar/semantics/STS/SemEval17T1STS_JAIS13b_ZeroShot.py
 create mode 100644 assets/ar/semantics/STS/SemEval17T2STS_JAIS13b_ZeroShot.py

diff --git a/assets/ar/semantics/NLI/XNLI_JAIS13b_ZeroShot.py b/assets/ar/semantics/NLI/XNLI_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..022bff8d
--- /dev/null
+++ b/assets/ar/semantics/NLI/XNLI_JAIS13b_ZeroShot.py
@@ -0,0 +1,57 @@
+from llmebench.datasets import XNLIDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import XNLITask
+
+
+def metadata():
+    """Return descriptive metadata (author, model, description) for this asset."""
+    author = "Arabic Language Technologies, QCRI, HBKU"
+    model = "Jais-13b-chat"
+    description = "Locally hosted Jais-13b-chat model using FastChat."
+    return {"author": author, "model": model, "description": description}
+
+
+def config():
+    """Return the dataset, task and model classes used by this asset."""
+    settings = {"dataset": XNLIDataset}
+    settings["task"] = XNLITask
+    settings["model"] = FastChatModel
+    return settings
+
+
+def prompt(input_sample):
+    """Build the zero-shot Arabic NLI chat prompt for one sentence pair.
+
+    ``input_sample`` must hold exactly two tab-separated sentences; any other
+    field count raises ``ValueError`` when the split result is unpacked.
+    Returns a single-message chat list with the ``user`` role.
+    """
+    premise, hypothesis = input_sample.split("\t")
+    instruction = "نقدم لك جملتين تمثلان فرضيتين. مهمتك هي تصنيف الفرضية اللاحقة بالنسبة للفرضية المسبقة تبعاً لواحدة من هذه التصنيفات: صحيح (الفرضية اللاحقة تدل على نفس الفرضية المسبقة)، خطأ (الفرضية اللاحقة تناقض الفرضية المسبقة)، أو غير معروف (حيادي). يجب أن يقتصر ردك على واحدة من هذه التصنيفات: صحيح، خطأ، أو غير معروف."
+    # Assemble instruction, both labelled sentences and the trailing answer cue.
+    pieces = [
+        instruction,
+        "\nالفرضية المسبقة: " + premise,
+        "\nالفرضية اللاحقة: " + hypothesis,
+        "\n" + "التصنيف: ",
+    ]
+    message = {"role": "user", "content": "".join(pieces)}
+
+    return [message]
+
+
+def post_process(response):
+    """Map the model's Arabic answer to an XNLI label, or None if unrecognized."""
+    answer = response["choices"][0]["message"]["content"]
+    answer = answer.replace(".", "").strip().lower()
+    # Rule order matters: the first label whose keyword occurs in the answer wins.
+    rules = (
+        ("neutral", ("غير معروف", "حيادي")),
+        ("entailment", ("صحيح", "تدل")),
+        ("contradiction", ("خطأ", "تناقض")),
+    )
+    for label, keywords in rules:
+        if any(keyword in answer for keyword in keywords):
+            return label
+    print(answer)  # surface the unmatched answer on stdout
+    return None
diff --git a/assets/ar/semantics/STS/Q2QSim_JAIS13b_ZeroShot.py b/assets/ar/semantics/STS/Q2QSim_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..0a06b019
--- /dev/null
+++ b/assets/ar/semantics/STS/Q2QSim_JAIS13b_ZeroShot.py
@@ -0,0 +1,57 @@
+from llmebench.datasets import STSQ2QDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import Q2QSimDetectionTask
+
+
+def metadata():
+    """Return descriptive metadata (author, model, description) for this asset."""
+    author = "Arabic Language Technologies, QCRI, HBKU"
+    model = "Jais-13b-chat"
+    description = "Locally hosted Jais-13b-chat model using FastChat."
+    return {"author": author, "model": model, "description": description}
+
+
+def config():
+    """Return the dataset, task and model classes plus model arguments.
+
+    ``model_args`` carries ``max_tries`` = 3.
+    """
+    settings = {"dataset": STSQ2QDataset, "task": Q2QSimDetectionTask}
+    settings["model"] = FastChatModel
+    settings["model_args"] = {"max_tries": 3}
+    return settings
+
+
+def prompt(input_sample):
+    """Build the zero-shot chat prompt asking whether two questions are similar.
+
+    ``input_sample`` must contain exactly two tab-separated questions; any other
+    field count raises ``ValueError``. Returns a one-message ``user`` chat list.
+    """
+    first, second = input_sample.split("\t")  # doubles as a two-field check
+    question_pair = "\t".join((first, second))
+    instruction = "Are the following two questions semantically similar (i.e., asking for similar information)? The output should be exactly in form yes or no."
+    content = instruction + "\n\n" + question_pair
+    return [{"role": "user", "content": content}]
+
+
+def post_process(response):
+    """Convert the model's free-text yes/no answer into a "1"/"0" label.
+
+    Periods are removed and the text is stripped and lower-cased before
+    matching. Negative cues take precedence over positive ones; when no cue
+    is found, None is returned.
+    """
+    answer = response["choices"][0]["message"]["content"]
+    answer = answer.replace(".", "").strip().lower()
+
+    negative_substrings = ("label: 0", "label: no", "not semantically similar")
+    is_negative = (
+        answer == "no"
+        or answer.startswith("no,")
+        or any(cue in answer for cue in negative_substrings)
+    )
+    if is_negative:
+        return "0"
+    is_positive = "yes" in answer or "label: 1" in answer
+    return "1" if is_positive else None
diff --git a/assets/ar/semantics/STS/SemEval17T1STS_JAIS13b_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T1STS_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..f3bdcd6d
--- /dev/null
+++ b/assets/ar/semantics/STS/SemEval17T1STS_JAIS13b_ZeroShot.py
@@ -0,0 +1,58 @@
+from llmebench.datasets import SemEval17T1STSDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import STSTask
+
+
+def metadata():
+    """Return descriptive metadata (author, model, description) for this asset."""
+    author = "Arabic Language Technologies, QCRI, HBKU"
+    model = "Jais-13b-chat"
+    description = "Locally hosted Jais-13b-chat model using FastChat."
+    return {"author": author, "model": model, "description": description}
+
+
+def config():
+    """Return the dataset, task and model classes plus model arguments.
+
+    ``model_args`` carries ``max_tries`` = 3.
+    """
+    settings = {"dataset": SemEval17T1STSDataset, "task": STSTask}
+    settings["model"] = FastChatModel
+    settings["model_args"] = {"max_tries": 3}
+    return settings
+
+
+def prompt(input_sample):
+    """Build the zero-shot chat prompt requesting a 0-5 similarity score.
+
+    ``input_sample`` is interpolated verbatim after the instruction text.
+    Returns a one-message ``user`` chat list.
+    """
+    instruction = (
+        "Given two sentences, produce a continuous valued similarity score on a "
+        "scale from 0 to 5, with 0 indicating that the semantics of the sentences are "
+        "completely independent and 5 signifying semantic equivalence. The output "
+        "should be exactly in form Similarity score =. \n"
+    )
+    return [{"role": "user", "content": f"{instruction}{input_sample}"}]
+
+
+def post_process(response):
+    """Parse the similarity score from the response; None if no number is found.
+
+    Handles ``Similarity score = <n>`` (space after ``=`` optional) as well as
+    a bare number; unparseable text yields None instead of an exception.
+    """
+    raw_response = response["choices"][0]["message"]["content"]
+    marker = "Similarity score ="
+    candidates = [raw_response]
+    if marker in raw_response:
+        # One candidate per marker occurrence: the first token that follows it.
+        segments = raw_response.split(marker)[1:]
+        candidates = [seg.split()[0].rstrip(".") for seg in segments if seg.split()]
+    for candidate in candidates:
+        try:
+            return float(candidate)
+        except ValueError:
+            continue
+    return None
diff --git a/assets/ar/semantics/STS/SemEval17T2STS_JAIS13b_ZeroShot.py b/assets/ar/semantics/STS/SemEval17T2STS_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..9f377eb3
--- /dev/null
+++ b/assets/ar/semantics/STS/SemEval17T2STS_JAIS13b_ZeroShot.py
@@ -0,0 +1,58 @@
+from llmebench.datasets import SemEval17T2STSDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import STSTask
+
+
+def metadata():
+    """Return descriptive metadata (author, model, description) for this asset."""
+    author = "Arabic Language Technologies, QCRI, HBKU"
+    model = "Jais-13b-chat"
+    description = "Locally hosted Jais-13b-chat model using FastChat."
+    return {"author": author, "model": model, "description": description}
+
+
+def config():
+    """Return the dataset, task and model classes plus model arguments.
+
+    ``model_args`` carries ``max_tries`` = 3.
+    """
+    settings = {"dataset": SemEval17T2STSDataset, "task": STSTask}
+    settings["model"] = FastChatModel
+    settings["model_args"] = {"max_tries": 3}
+    return settings
+
+
+def prompt(input_sample):
+    """Build the zero-shot chat prompt requesting a 0-5 similarity score.
+
+    The instruction text is followed by a newline and ``input_sample`` as given.
+    Returns a list holding one ``user`` chat message.
+    """
+    header = (
+        "Given two sentences, produce a continuous valued similarity score on a "
+        "scale from 0 to 5, with 0 indicating that the semantics of the sentences are "
+        "completely independent and 5 signifying semantic equivalence. The output "
+        "should be exactly in form Similarity score =. "
+    )
+    return [{"role": "user", "content": f"{header}\n{input_sample}"}]
+
+
+def post_process(response):
+    """Parse the similarity score from the response; None if no number is found.
+
+    Handles ``Similarity score = <n>`` (space after ``=`` optional) as well as
+    a bare number; unparseable text yields None instead of an exception.
+    """
+    raw_response = response["choices"][0]["message"]["content"]
+    marker = "Similarity score ="
+    candidates = [raw_response]
+    if marker in raw_response:
+        # One candidate per marker occurrence: the first token that follows it.
+        segments = raw_response.split(marker)[1:]
+        candidates = [seg.split()[0].rstrip(".") for seg in segments if seg.split()]
+    for candidate in candidates:
+        try:
+            return float(candidate)
+        except ValueError:
+            continue
+    return None