Add missing Lemmatization assets for BLOOMZ and GPT4

qcri · Aug 31, 2023 · 25ebf9b · 25ebf9b
1 parent ccafb6f
commit 25ebf9b
Show file tree

Hide file tree

Showing 2 changed files with 104 additions and 0 deletions.
diff --git a/...equence_tagging_and_information_extraction/lemmatization/Lemmatization_BLOOMZ_ZeroShot.py b/...equence_tagging_and_information_extraction/lemmatization/Lemmatization_BLOOMZ_ZeroShot.py
@@ -0,0 +1,50 @@
+import os
+
+from llmebench.datasets import LemmatizationDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import LemmatizationTask
+
+
+def config():
+    """Return the asset configuration for zero-shot lemmatization.
+
+    The endpoint handed to ``BLOOMPetalModel`` is read from the ``API_URL``
+    environment variable; a ``KeyError`` is raised if it is not set.
+    """
+    model_settings = {
+        "api_url": os.environ["API_URL"],
+        "max_tries": 3,
+    }
+    general_settings = {
+        "data_path": "data/sequence_tagging_ner_pos_etc/lemmatization/WikiNews-26-06-2015-RefLemma.txt"
+    }
+
+    return {
+        "dataset": LemmatizationDataset,
+        "dataset_args": {},
+        "task": LemmatizationTask,
+        "task_args": {},
+        "model": BLOOMPetalModel,
+        "model_args": model_settings,
+        "general_args": general_settings,
+    }
+
+
+def prompt(input_sample):
+    """Build the zero-shot lemmatization prompt for one input sample.
+
+    Args:
+        input_sample: The Arabic sentence to lemmatize, as a string.
+
+    Returns:
+        A dict with a single "prompt" key holding the instruction, the
+        sentence, and a trailing "label:" cue.
+    """
+    instruction = "for every word in the following Arabic sentence, write only the arabic lemma of the word separated by a single space without explanation.\n\n"
+    # Keep the "label:" cue on its own line; without the newline it is glued
+    # to the last word of the sentence.
+    return {"prompt": f"{instruction}sentence: {input_sample}\nlabel: \n"}
+
+
+def post_process(response):
+    """Extract the predicted lemma sequence from a model response.
+
+    Args:
+        response: Mapping whose "outputs" entry holds the generated text.
+
+    Returns:
+        ``(None, text)`` with the cleaned prediction, or ``None`` when the
+        model replied with a refusal/clarification instead of lemmas.
+    """
+    label = response["outputs"]
+    # Drop the echoed "label" cue and the sentence-boundary tokens.
+    for marker in ("label:", "label", "<s>", "</s>"):
+        label = label.replace(marker, "")
+    # Strip before the prefix checks so leading whitespace cannot hide a
+    # refusal message.
+    label = label.strip()
+
+    if (
+        label.startswith(("Please provide the Arabic sentence", "It seems"))
+        or "is not" in label
+    ):
+        # This path used to set label to None and then call .strip() on it,
+        # which raised AttributeError; report the failure as None instead.
+        return None
+
+    # TODO: fix hack to handle prediction failure
+    return (None, label)
diff --git a/.../sequence_tagging_and_information_extraction/lemmatization/Lemmatization_GPT4_ZeroShot.py b/.../sequence_tagging_and_information_extraction/lemmatization/Lemmatization_GPT4_ZeroShot.py
@@ -0,0 +1,54 @@
+import os
+
+from llmebench.datasets import LemmatizationDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import LemmatizationTask
+
+
+def config():
+    """Return the asset configuration for zero-shot lemmatization.
+
+    The Azure settings handed to ``GPTChatCompletionModel`` are read from
+    the ``AZURE_API_URL``, ``AZURE_API_KEY`` and ``ENGINE_NAME`` environment
+    variables; a ``KeyError`` is raised if any of them is not set.
+    """
+    model_settings = {
+        "api_type": "azure",
+        "api_version": "2023-03-15-preview",
+        "api_base": os.environ["AZURE_API_URL"],
+        "api_key": os.environ["AZURE_API_KEY"],
+        "engine_name": os.environ["ENGINE_NAME"],
+        "max_tries": 3,
+    }
+    general_settings = {
+        "data_path": "data/sequence_tagging_ner_pos_etc/lemmatization/WikiNews-26-06-2015-RefLemma.txt"
+    }
+
+    return {
+        "dataset": LemmatizationDataset,
+        "dataset_args": {},
+        "task": LemmatizationTask,
+        "task_args": {},
+        "model": GPTChatCompletionModel,
+        "model_args": model_settings,
+        "general_args": general_settings,
+    }
+
+
+def prompt(input_sample):
+    """Build the chat messages for zero-shot lemmatization of one sample.
+
+    Args:
+        input_sample: The Arabic sentence to lemmatize; it is interpolated
+            into the user message.
+
+    Returns:
+        A two-element list of chat messages: a system message followed by
+        the user message carrying the instruction and the sentence.
+    """
+    system_message = {
+        "role": "system",
+        "content": "You are a language expert, you can identify the lemma of any word within a sentence.",
+    }
+    # The instruction used to say "Arabic word", which contradicted
+    # "for every word"; the input is a sentence.
+    user_message = {
+        "role": "user",
+        "content": f"for every word in the following Arabic sentence, write only the lemma without diacritics separated by a single space without explanation:\n {input_sample}",
+    }
+    return [system_message, user_message]
+
+
+def post_process(response):
+    """Pull the predicted lemma sequence out of a chat completion response.
+
+    Returns ``None`` when the reply looks like a refusal or clarification
+    request, otherwise ``(None, content)`` with the first choice's text.
+    """
+    content = response["choices"][0]["message"]["content"]
+
+    refusal_prefixes = ("Please provide the Arabic sentence", "It seems")
+    if content.startswith(refusal_prefixes) or "is not" in content:
+        return None
+
+    # TODO: fix hack to handle prediction failure
+    return (None, content)