From 891654db9e081d795a63e89c2636c5d89dbd3371 Mon Sep 17 00:00:00 2001
From: Arid Hasan <18038960+AridHasan@users.noreply.github.com>
Date: Wed, 31 Jan 2024 07:22:31 -0400
Subject: [PATCH] Add Jais ZeroShot assets for Arabic QA tasks (#258)

* Add JAIS13b for arabic QA

* Format code and minor metadata changes

---------

Co-authored-by: Fahim Imaduddin Dalvi
---
 assets/ar/QA/ARCD_JAIS13b_ZeroShot.py   | 33 ++++++++++++++++++++++
 assets/ar/QA/MLQA_JAIS13b_ZeroShot.py   | 36 ++++++++++++++++++++++++
 assets/ar/QA/TyDiQA_JAIS13b_ZeroShot.py | 37 +++++++++++++++++++++++++
 assets/ar/QA/XQuAD_JAIS13b_ZeroShot.py  | 36 ++++++++++++++++++++++++
 4 files changed, 142 insertions(+)
 create mode 100644 assets/ar/QA/ARCD_JAIS13b_ZeroShot.py
 create mode 100644 assets/ar/QA/MLQA_JAIS13b_ZeroShot.py
 create mode 100644 assets/ar/QA/TyDiQA_JAIS13b_ZeroShot.py
 create mode 100644 assets/ar/QA/XQuAD_JAIS13b_ZeroShot.py

diff --git a/assets/ar/QA/ARCD_JAIS13b_ZeroShot.py b/assets/ar/QA/ARCD_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..5df663e3
--- /dev/null
+++ b/assets/ar/QA/ARCD_JAIS13b_ZeroShot.py
@@ -0,0 +1,33 @@
+from llmebench.datasets import ARCDDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import QATask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+    }
+
+
+def config():
+    return {
+        "dataset": ARCDDataset,
+        "task": QATask,
+        "model": FastChatModel,
+    }
+
+
+def prompt(input_sample):
+    base_prompt = f"مهمتك هي الإجابة على الأسئلة باللغة العربية بناءً على سياق معين.\nملاحظة: يجب أن تكون إجاباتك مستخرجة من السياق المحدد دون أي اضافات.\nلست بحاجة إلى تقديم إجابة كاملة.\nالسياق: {input_sample['context']}\n السؤال: {input_sample['question']}\n الجواب:"
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    return response["choices"][0]["message"]["content"]
diff --git a/assets/ar/QA/MLQA_JAIS13b_ZeroShot.py b/assets/ar/QA/MLQA_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..603eafe5
--- /dev/null
+++ b/assets/ar/QA/MLQA_JAIS13b_ZeroShot.py
@@ -0,0 +1,36 @@
+from llmebench.datasets import MLQADataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import QATask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+    }
+
+
+def config():
+    return {
+        "dataset": MLQADataset,
+        "task": QATask,
+        "model": FastChatModel,
+        "model_args": {
+            "max_tries": 50,
+        },
+    }
+
+
+def prompt(input_sample):
+    base_prompt = f"Your task is to answer questions in Arabic based on a given context.\nNote: Your answers should be spans extracted from the given context without any illustrations.\nYou don't need to provide a complete answer\nContext:{input_sample['context']}\nQuestion:{input_sample['question']}\nAnswer:"
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    return response["choices"][0]["message"]["content"]
diff --git a/assets/ar/QA/TyDiQA_JAIS13b_ZeroShot.py b/assets/ar/QA/TyDiQA_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..896a800d
--- /dev/null
+++ b/assets/ar/QA/TyDiQA_JAIS13b_ZeroShot.py
@@ -0,0 +1,37 @@
+from llmebench.datasets import TyDiQADataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import QATask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+    }
+
+
+def config():
+    return {
+        "dataset": TyDiQADataset,
+        "task": QATask,
+        "model": FastChatModel,
+        "model_args": {
+            "max_tries": 50,
+        },
+        "general_args": {"test_split": "dev"},
+    }
+
+
+def prompt(input_sample):
+    base_prompt = f"Your task is to answer questions in Arabic based on a given context.\nNote: Your answers should be spans extracted from the given context without any illustrations.\nYou don't need to provide a complete answer\nContext:{input_sample['context']}\nQuestion:{input_sample['question']}\nAnswer:"
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    return response["choices"][0]["message"]["content"]
diff --git a/assets/ar/QA/XQuAD_JAIS13b_ZeroShot.py b/assets/ar/QA/XQuAD_JAIS13b_ZeroShot.py
new file mode 100644
index 00000000..66bbeacd
--- /dev/null
+++ b/assets/ar/QA/XQuAD_JAIS13b_ZeroShot.py
@@ -0,0 +1,36 @@
+from llmebench.datasets import XQuADDataset
+from llmebench.models import FastChatModel
+from llmebench.tasks import QATask
+
+
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Jais-13b-chat",
+        "description": "Locally hosted Jais-13b-chat model using FastChat.",
+    }
+
+
+def config():
+    return {
+        "dataset": XQuADDataset,
+        "task": QATask,
+        "model": FastChatModel,
+        "model_args": {
+            "max_tries": 50,
+        },
+    }
+
+
+def prompt(input_sample):
+    base_prompt = f"Your task is to answer questions in Arabic based on a given context.\nNote: Your answers should be spans extracted from the given context without any illustrations.\nYou don't need to provide a complete answer\nContext:{input_sample['context']}\nQuestion:{input_sample['question']}\nAnswer:"
+    return [
+        {
+            "role": "user",
+            "content": base_prompt,
+        },
+    ]
+
+
+def post_process(response):
+    return response["choices"][0]["message"]["content"]