From 25ebf9bacd5d93a94f63058ad0d44379dd52bc01 Mon Sep 17 00:00:00 2001
From: Fahim Imaduddin Dalvi
Date: Thu, 31 Aug 2023 11:16:36 +0300
Subject: [PATCH] Add missing Lemmatization assets for BLOOMZ and GPT4

---
 .../Lemmatization_BLOOMZ_ZeroShot.py | 50 +++++++++++++++++
 .../Lemmatization_GPT4_ZeroShot.py   | 54 +++++++++++++++++++
 2 files changed, 104 insertions(+)
 create mode 100644 assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_BLOOMZ_ZeroShot.py
 create mode 100644 assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_GPT4_ZeroShot.py

diff --git a/assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_BLOOMZ_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_BLOOMZ_ZeroShot.py
new file mode 100644
index 00000000..d6cc0bed
--- /dev/null
+++ b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_BLOOMZ_ZeroShot.py
@@ -0,0 +1,50 @@
+import os
+
+from llmebench.datasets import LemmatizationDataset
+from llmebench.models import BLOOMPetalModel
+from llmebench.tasks import LemmatizationTask
+
+
+def config():
+    return {
+        "dataset": LemmatizationDataset,
+        "dataset_args": {},
+        "task": LemmatizationTask,
+        "task_args": {},
+        "model": BLOOMPetalModel,
+        "model_args": {
+            "api_url": os.environ["API_URL"],
+            "max_tries": 3,
+        },
+        "general_args": {
+            "data_path": "data/sequence_tagging_ner_pos_etc/lemmatization/WikiNews-26-06-2015-RefLemma.txt"
+        },
+    }
+
+
+def prompt(input_sample):
+    return {
+        "prompt": "for every word in the following Arabic sentence, write only the Arabic lemma of the word separated by a single space without explanation.\n\n"
+        + "sentence: "
+        + input_sample
+        + "label: \n"
+    }
+
+
+def post_process(response):
+    label = response["outputs"]
+    label = label.replace("label:", "")
+    label = label.replace("label", "")
+
+    label = label.replace("<s>", "")  # strip special tokens the model may emit
+    label = label.replace("</s>", "")
+
+    if (
+        label.startswith("Please provide the Arabic sentence")
+        or label.startswith("It seems")
+        or "is not" in label
+    ):
+        label = None
+
+    # TODO: fix hack to handle prediction failure
+    return (None, label.strip()) if label else None
diff --git a/assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_GPT4_ZeroShot.py
new file mode 100644
index 00000000..8be1b2cc
--- /dev/null
+++ b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/Lemmatization_GPT4_ZeroShot.py
@@ -0,0 +1,54 @@
+import os
+
+from llmebench.datasets import LemmatizationDataset
+from llmebench.models import GPTChatCompletionModel
+from llmebench.tasks import LemmatizationTask
+
+
+def config():
+    return {
+        "dataset": LemmatizationDataset,
+        "dataset_args": {},
+        "task": LemmatizationTask,
+        "task_args": {},
+        "model": GPTChatCompletionModel,
+        "model_args": {
+            "api_type": "azure",
+            "api_version": "2023-03-15-preview",
+            "api_base": os.environ["AZURE_API_URL"],
+            "api_key": os.environ["AZURE_API_KEY"],
+            "engine_name": os.environ["ENGINE_NAME"],
+            "max_tries": 3,
+        },
+        "general_args": {
+            "data_path": "data/sequence_tagging_ner_pos_etc/lemmatization/WikiNews-26-06-2015-RefLemma.txt"
+        },
+    }
+
+
+def prompt(input_sample):
+    return [
+        {
+            "role": "system",
+            "content": "You are a language expert, you can identify the lemma of any word within a sentence.",
+        },
+        {
+            "role": "user",
+            "content": f"for every word in the following Arabic sentence, write only the lemma without diacritics separated by a single space without explanation:\n {input_sample}",
+        },
+    ]
+
+
+def post_process(response):
+    x = response["choices"][0]["message"]["content"]
+    if (
+        x.startswith("Please provide the Arabic sentence")
+        or x.startswith("It seems")
+        or "is not" in x
+    ):
+        out = None
+    else:
+        # TODO: fix hack to handle prediction failure
+        out = (None, x)
+
+    return out