From 25ebf9bacd5d93a94f63058ad0d44379dd52bc01 Mon Sep 17 00:00:00 2001
From: Fahim Imaduddin Dalvi
Date: Thu, 31 Aug 2023 11:16:36 +0300
Subject: [PATCH] Add missing Lemmatization assets for BLOOMZ and GPT4

---
 .../Lemmatization_BLOOMZ_ZeroShot.py | 50 +++++++++++++++++
 .../Lemmatization_GPT4_ZeroShot.py   | 54 +++++++++++++++++++
 2 files changed, 104 insertions(+)
 create mode 100644 assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_BLOOMZ_ZeroShot.py
 create mode 100644 assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_GPT4_ZeroShot.py

diff --git a/assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_BLOOMZ_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_BLOOMZ_ZeroShot.py
new file mode 100644
index 00000000..d6cc0bed
--- /dev/null
+++ b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_BLOOMZ_ZeroShot.py
@@ -0,0 +1,50 @@
+import os
+
+from llmebench.datasets import LemmatizationDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import LemmatizationTask
+
+
+def config():
+    return {
+        "dataset": LemmatizationDataset,
+        "dataset_args": {},
+        "task": LemmatizationTask,
+        "task_args": {},
+        "model": BLOOMPetalModel,
+        "model_args": {
+            "api_url": os.environ["API_URL"],
+            "max_tries": 3,
+        },
+        "general_args": {
+            "data_path": "data/sequence_tagging_ner_pos_etc/lemmatization/WikiNews-26-06-2015-RefLemma.txt"
+        },
+    }
+
+
+def prompt(input_sample):
+    return {
+        "prompt": "for every word in the following Arabic sentence, write only the Arabic lemma of the word separated by a single space without explanation.\n\n"
+        + "sentence: "
+        + input_sample
+        + "label: \n"
+    }
+
+
+def post_process(response):
+    label = response["outputs"]
+    label = label.replace("label:", "")
+    label = label.replace("label", "")
+
+    label = label.replace("<s>", "")  # strip special tokens the model may emit
+    label = label.replace("</s>", "")
+
+    if (
+        label.startswith("Please provide the Arabic sentence")
+        or label.startswith("It seems")
+        or "is not" in label
+    ):
+        label = None
+
+    # TODO: fix hack to handle prediction failure
+    return (None, label.strip()) if label else None
diff --git a/assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_GPT4_ZeroShot.py
new file mode 100644
index 00000000..8be1b2cc
--- /dev/null
+++ b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_GPT4_ZeroShot.py
@@ -0,0 +1,54 @@
+import os
+
+from llmebench.datasets import LemmatizationDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import LemmatizationTask
+
+
+def config():
+    return {
+        "dataset": LemmatizationDataset,
+        "dataset_args": {},
+        "task": LemmatizationTask,
+        "task_args": {},
+        "model": GPTChatCompletionModel,
+        "model_args": {
+            "api_type": "azure",
+            "api_version": "2023-03-15-preview",
+            "api_base": os.environ["AZURE_API_URL"],
+            "api_key": os.environ["AZURE_API_KEY"],
+            "engine_name": os.environ["ENGINE_NAME"],
+            "max_tries": 3,
+        },
+        "general_args": {
+            "data_path": "data/sequence_tagging_ner_pos_etc/lemmatization/WikiNews-26-06-2015-RefLemma.txt"
+        },
+    }
+
+
+def prompt(input_sample):
+    return [
+        {
+            "role": "system",
+            "content": "You are a language expert, you can identify the lemma of any word within a sentence.",
+        },
+        {
+            "role": "user",
+            "content": f"for every word in the following Arabic sentence, write only the lemma without diacritics separated by a single space without explanation:\n {input_sample}",
+        },
+    ]
+
+
+def post_process(response):
+    x = response["choices"][0]["message"]["content"]
+    if (
+        x.startswith("Please provide the Arabic sentence")
+        or x.startswith("It seems")
+        or "is not" in x
+    ):
+        out = None
+    else:
+        # TODO: fix hack to handle prediction failure
+        out = (None, x)
+
+    return out