From 97dae47e99a7b6af5b2b4040135da908949c697b Mon Sep 17 00:00:00 2001
From: Basel Mousi <bmousi@hbku.edu.qa>
Date: Sun, 27 Aug 2023 14:30:49 +0300
Subject: [PATCH 1/7] temporarily disable SUMMA test sets in AraBench Ara2Eng MT asset

---
 .../MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py           | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py b/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py
index 68db7bb7..a550b839 100644
--- a/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py
+++ b/assets/benchmark_v1/MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py
@@ -44,11 +44,11 @@ def config():
         "madar.test.nil.0.sd",
         "madar.test.nil.1.eg",
         "madar.test.nil.2.eg",
-        "summa-2M.test.mgr.0.ma",
-        "summa-AJ.test.msa.0.ms",
-        "summa-BBC.test.msa.0.ms",
-        "summa-LBC.test.lev.0.lb",
-        "summa-Oman.test.glf.0.om",
+        # "summa-2M.test.mgr.0.ma",
+        # "summa-AJ.test.msa.0.ms",
+        # "summa-BBC.test.msa.0.ms",
+        # "summa-LBC.test.lev.0.lb",
+        # "summa-Oman.test.glf.0.om",
     ]
     configs = []
     for testset in sets:

From 2e11bca6d0252fe566b3fb953b489b1861e13e79 Mon Sep 17 00:00:00 2001
From: Basel Mousi <bmousi@hbku.edu.qa>
Date: Sat, 9 Sep 2023 15:01:18 +0300
Subject: [PATCH 2/7] added data loader for the Shami Corpus

---
 llmebench/datasets/ShamiCorpus.py | 40 +++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 llmebench/datasets/ShamiCorpus.py

diff --git a/llmebench/datasets/ShamiCorpus.py b/llmebench/datasets/ShamiCorpus.py
new file mode 100644
index 00000000..39907147
--- /dev/null
+++ b/llmebench/datasets/ShamiCorpus.py
@@ -0,0 +1,40 @@
+from llmebench.datasets.dataset_base import DatasetBase
+from pathlib import Path
+
+class ShamiDataset(DatasetBase): 
+    def __init__(self, **kwargs): 
+        super(ShamiDataset, self).__init__(**kwargs) 
+    def metadata(): 
+        return { 
+            "language":"ar", 
+            "citation": """ @inproceedings{abu-kwaik-etal-2018-shami,
+            title = "{S}hami: A Corpus of {L}evantine {A}rabic Dialects",
+            author = "Abu Kwaik, Kathrein  and
+            Saad, Motaz  and
+            Chatzikyriakidis, Stergios  and
+            Dobnik, Simon",
+            booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)",
+            month = may,
+            year = "2018",
+            address = "Miyazaki, Japan",
+            publisher = "European Language Resources Association (ELRA)",
+            url = "https://aclanthology.org/L18-1576",
+        }
+"""
+        }
+    def get_data_sample(self): 
+        return {"input": "a sentence", "label": "dialect of sentence"} 
+    
+    def load_data(self, data_path, no_labels=False): 
+        data = []
+        filenames= ["Jordanian.txt", "Lebanese.txt", "Palestinian.txt", "Syrian.txt"]
+        for name in filenames: 
+            path = Path(data_path) / name 
+            with open(path, "r") as reader: 
+                for line in reader: 
+                    sentence = line.strip() 
+                    label = name.split(".")[0]
+                    data.append( 
+                        {"input": sentence, "label": label}
+                    )
+        return data

From 80aaeb419b870c74faf13baea69d08d1d48c1503 Mon Sep 17 00:00:00 2001
From: Basel Mousi <bmousi@hbku.edu.qa>
Date: Sat, 9 Sep 2023 15:41:15 +0300
Subject: [PATCH 3/7] exported the dataset in init file

---
 llmebench/datasets/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llmebench/datasets/__init__.py b/llmebench/datasets/__init__.py
index 65cba2c7..2c47d72b 100644
--- a/llmebench/datasets/__init__.py
+++ b/llmebench/datasets/__init__.py
@@ -49,3 +49,4 @@
 from .XGLUEPOS import XGLUEPOSDataset
 from .XNLI import XNLIDataset
 from .XQuAD import XQuADDataset
+from .ShamiCorpus import ShamiDataset
\ No newline at end of file

From 645958ebc2fb0727ce05106704a00f7e95702759 Mon Sep 17 00:00:00 2001
From: Basel Mousi <bmousi@hbku.edu.qa>
Date: Sat, 9 Sep 2023 15:42:07 +0300
Subject: [PATCH 4/7] added a chatgpt ZS asset for dialect identification on
 Shami Corpus

---
 .../ShamiCorpus_GPT35_ZeroShot.py             | 54 +++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py

diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py
new file mode 100644
index 00000000..62852413
--- /dev/null
+++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py
@@ -0,0 +1,54 @@
+import os
+from llmebench.datasets import ShamiDataset
+from llmebench.models import LegacyOpenAIModel
+from llmebench.tasks import DialectIDTask
+
+
+
+def config():
+    return {
+        "dataset": ShamiDataset,
+        "dataset_args": {},
+        "task": DialectIDTask,
+        "task_args": {},
+        "model": LegacyOpenAIModel,
+        "model_args": {
+            "api_type": "azure",
+            "api_version": "2023-03-15-preview",
+            "api_base": os.environ["AZURE_API_URL"],
+            "api_key": os.environ["AZURE_API_KEY"],
+            "engine_name": os.environ["ENGINE_NAME"],
+            "class_labels": [ 
+                "Lebanese", 
+                "Jordanian",
+                "Palestinian", 
+                "Syrian"
+
+            ],
+            "max_tries": 3,
+        },
+        "general_args": {
+            "data_path": "data/dialect-data/shami-corpus"
+        },
+    }
+
+
+def prompt(input_sample): 
+    prompt_string = (
+        f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: "
+    )
+
+    return { 
+        "system_message": "You are an AI assistant that helps people find information.", 
+        "messages": [ 
+            { 
+                "sender": "user", 
+                "text": prompt_string
+            }
+        ]
+    }
+
+
+def post_process(response): 
+    label = response["choices"][0]["text"]
+    return label
\ No newline at end of file

From a2ca6d553d8854215ee782d9e2a177866999c602 Mon Sep 17 00:00:00 2001
From: Basel Mousi <bmousi@hbku.edu.qa>
Date: Sat, 9 Sep 2023 15:50:04 +0300
Subject: [PATCH 5/7] Added gpt4 zero-shot asset for dialect identification on
 Shami Corpus

---
 .../ShamiCorpus_GPT4_ZeroShot.py              | 55 +++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py

diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py
new file mode 100644
index 00000000..8a7217c9
--- /dev/null
+++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py
@@ -0,0 +1,55 @@
+import os
+from llmebench.datasets import ShamiDataset
+from llmebench.models import OpenAIModel
+from llmebench.tasks import DialectIDTask
+
+
+
+def config():
+    return {
+        "dataset": ShamiDataset,
+        "dataset_args": {},
+        "task": DialectIDTask,
+        "task_args": {},
+        "model": OpenAIModel,
+        "model_args": {
+            "api_type": "azure",
+            "api_version": "2023-03-15-preview",
+            "api_base": os.environ["AZURE_API_URL"],
+            "api_key": os.environ["AZURE_API_KEY"],
+            "engine_name": os.environ["ENGINE_NAME"],
+            "class_labels": [ 
+                "Lebanese", 
+                "Jordanian",
+                "Palestinian", 
+                "Syrian"
+
+            ],
+            "max_tries": 3,
+        },
+        "general_args": {
+            "data_path": "data/dialect-data/shami-corpus"
+        },
+    }
+
+
+def prompt(input_sample): 
+    prompt_string = (
+        f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: "
+    )
+
+    return [
+        {
+            "role": "system",
+            "content": "You are an AI assistant that helps people find information.",
+        },
+        {
+            "role": "user",
+            "content": prompt_string,
+        },
+    ]
+
+
+def post_process(response): 
+    label = response["choices"][0]["message"]["content"]
+    return label
\ No newline at end of file

From 2448d547f3dcaf597800cf3b6d5c0813bc777b24 Mon Sep 17 00:00:00 2001
From: Basel Mousi <bmousi@hbku.edu.qa>
Date: Sat, 9 Sep 2023 16:14:43 +0300
Subject: [PATCH 6/7] Added zero shot asset for dialect identification - bloom

---
 .../ShamiCorpus_BLOOMZ_ZeroShot.py            | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py

diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py
new file mode 100644
index 00000000..5e765df4
--- /dev/null
+++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py
@@ -0,0 +1,51 @@
+import os
+
+from llmebench.datasets import ShamiDataset
+from llmebench.models import PetalsModel
+from llmebench.tasks import DialectIDTask
+
+
+def config():
+    return {
+        "dataset": ShamiDataset,
+        "dataset_args": {},
+        "task": DialectIDTask,
+        "task_args": {},
+        "model": PetalsModel,
+        "model_args": {
+            "api_url": os.environ["API_URL"],
+            "class_labels": [
+
+                "Lebanese", 
+                "Jordanian", 
+                "Palestinian", 
+                "Syrian"             
+            ],
+            "max_tries": 22,
+        },
+        "general_args": {
+            "data_path": "data/dialect-data/shami-corpus",
+        },
+    }
+
+
+def prompt(input_sample):
+    prompt_string = (
+        f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: "
+    )
+
+    return {
+        "prompt": prompt_string,
+    }
+
+
+def post_process(response):
+    label = response["outputs"].strip()
+    #label = label.replace("<s>", "")
+    label = label.replace("</s>", "")
+    # label = label.replace("Dialect: ", "").replace("dialect: ", "")
+    # label = label.replace("label: ", "")
+    # label = label.strip()
+
+
+    return label

From f0fb456e3de531fe7ff1e1c15bb01ee1f840268f Mon Sep 17 00:00:00 2001
From: Basel Mousi <bmousi@hbku.edu.qa>
Date: Sat, 9 Sep 2023 16:18:00 +0300
Subject: [PATCH 7/7] formatted code for shami corpus dialect identification

---
 .../ShamiCorpus_BLOOMZ_ZeroShot.py            | 15 ++-----
 .../ShamiCorpus_GPT35_ZeroShot.py             | 35 +++++----------
 .../ShamiCorpus_GPT4_ZeroShot.py              | 24 +++-------
 llmebench/datasets/ShamiCorpus.py             | 44 ++++++++++---------
 llmebench/datasets/__init__.py                |  2 +-
 5 files changed, 44 insertions(+), 76 deletions(-)

diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py
index 5e765df4..573ecf0f 100644
--- a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py
+++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_BLOOMZ_ZeroShot.py
@@ -14,13 +14,7 @@ def config():
         "model": PetalsModel,
         "model_args": {
             "api_url": os.environ["API_URL"],
-            "class_labels": [
-
-                "Lebanese", 
-                "Jordanian", 
-                "Palestinian", 
-                "Syrian"             
-            ],
+            "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"],
             "max_tries": 22,
         },
         "general_args": {
@@ -30,9 +24,7 @@ def config():
 
 
 def prompt(input_sample):
-    prompt_string = (
-        f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: "
-    )
+    prompt_string = f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: "
 
     return {
         "prompt": prompt_string,
@@ -41,11 +33,10 @@ def prompt(input_sample):
 
 def post_process(response):
     label = response["outputs"].strip()
-    #label = label.replace("<s>", "")
+    # label = label.replace("<s>", "")
     label = label.replace("</s>", "")
     # label = label.replace("Dialect: ", "").replace("dialect: ", "")
     # label = label.replace("label: ", "")
     # label = label.strip()
 
-
     return label
diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py
index 62852413..610f3c6d 100644
--- a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py
+++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT35_ZeroShot.py
@@ -1,10 +1,10 @@
 import os
+
 from llmebench.datasets import ShamiDataset
 from llmebench.models import LegacyOpenAIModel
 from llmebench.tasks import DialectIDTask
 
 
-
 def config():
     return {
         "dataset": ShamiDataset,
@@ -18,37 +18,22 @@ def config():
             "api_base": os.environ["AZURE_API_URL"],
             "api_key": os.environ["AZURE_API_KEY"],
             "engine_name": os.environ["ENGINE_NAME"],
-            "class_labels": [ 
-                "Lebanese", 
-                "Jordanian",
-                "Palestinian", 
-                "Syrian"
-
-            ],
+            "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"],
             "max_tries": 3,
         },
-        "general_args": {
-            "data_path": "data/dialect-data/shami-corpus"
-        },
+        "general_args": {"data_path": "data/dialect-data/shami-corpus"},
     }
 
 
-def prompt(input_sample): 
-    prompt_string = (
-        f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: "
-    )
+def prompt(input_sample):
+    prompt_string = f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: "
 
-    return { 
-        "system_message": "You are an AI assistant that helps people find information.", 
-        "messages": [ 
-            { 
-                "sender": "user", 
-                "text": prompt_string
-            }
-        ]
+    return {
+        "system_message": "You are an AI assistant that helps people find information.",
+        "messages": [{"sender": "user", "text": prompt_string}],
     }
 
 
-def post_process(response): 
+def post_process(response):
     label = response["choices"][0]["text"]
-    return label
\ No newline at end of file
+    return label
diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py
index 8a7217c9..f2480e82 100644
--- a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py
+++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ShamiCorpus_GPT4_ZeroShot.py
@@ -1,10 +1,10 @@
 import os
+
 from llmebench.datasets import ShamiDataset
 from llmebench.models import OpenAIModel
 from llmebench.tasks import DialectIDTask
 
 
-
 def config():
     return {
         "dataset": ShamiDataset,
@@ -18,25 +18,15 @@ def config():
             "api_base": os.environ["AZURE_API_URL"],
             "api_key": os.environ["AZURE_API_KEY"],
             "engine_name": os.environ["ENGINE_NAME"],
-            "class_labels": [ 
-                "Lebanese", 
-                "Jordanian",
-                "Palestinian", 
-                "Syrian"
-
-            ],
+            "class_labels": ["Lebanese", "Jordanian", "Palestinian", "Syrian"],
             "max_tries": 3,
         },
-        "general_args": {
-            "data_path": "data/dialect-data/shami-corpus"
-        },
+        "general_args": {"data_path": "data/dialect-data/shami-corpus"},
     }
 
 
-def prompt(input_sample): 
-    prompt_string = (
-        f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: "
-    )
+def prompt(input_sample):
+    prompt_string = f"Task Description: You are an expert in identifying the dialect of a given arabic text. You will be given a text and you should output the dialect to which the text belongs.\nNote: Please make sure that the class that you output is one of the following: Lebanese, Jordanian, Palestinian, or Syrian.\n Output the class only without any illustrations\nInput:{input_sample} \nLabel: "
 
     return [
         {
@@ -50,6 +40,6 @@ def prompt(input_sample):
     ]
 
 
-def post_process(response): 
+def post_process(response):
     label = response["choices"][0]["message"]["content"]
-    return label
\ No newline at end of file
+    return label
diff --git a/llmebench/datasets/ShamiCorpus.py b/llmebench/datasets/ShamiCorpus.py
index 39907147..ca6503b9 100644
--- a/llmebench/datasets/ShamiCorpus.py
+++ b/llmebench/datasets/ShamiCorpus.py
@@ -1,12 +1,21 @@
-from llmebench.datasets.dataset_base import DatasetBase
 from pathlib import Path
 
-class ShamiDataset(DatasetBase): 
-    def __init__(self, **kwargs): 
-        super(ShamiDataset, self).__init__(**kwargs) 
-    def metadata(): 
-        return { 
-            "language":"ar", 
+from llmebench.datasets.dataset_base import DatasetBase
+
+
+class ShamiDataset(DatasetBase):
+    """Loader for the Shami corpus of Levantine Arabic dialects."""
+
+    def __init__(self, **kwargs):
+        super(ShamiDataset, self).__init__(**kwargs)
+
+    # No ``self`` parameter: declare it static so it can be called on the class
+    # and on instances alike (an instance call would otherwise raise TypeError).
+    @staticmethod
+    def metadata():
+        """Return the dataset's language code and its BibTeX citation."""
+        return {
+            "language": "ar",
             "citation": """ @inproceedings{abu-kwaik-etal-2018-shami,
             title = "{S}hami: A Corpus of {L}evantine {A}rabic Dialects",
             author = "Abu Kwaik, Kathrein  and
@@ -20,21 +23,31 @@ def metadata():
             publisher = "European Language Resources Association (ELRA)",
             url = "https://aclanthology.org/L18-1576",
         }
-"""
+""",
         }
-    def get_data_sample(self): 
-        return {"input": "a sentence", "label": "dialect of sentence"} 
-    
-    def load_data(self, data_path, no_labels=False): 
+
+    def get_data_sample(self):
+        """Return a placeholder sample illustrating the keys of a loaded item."""
+        return {"input": "a sentence", "label": "dialect of sentence"}
+
+    def load_data(self, data_path, no_labels=False):
+        """Read the four per-dialect text files found under *data_path*.
+
+        Every line of each file becomes one sample; its label is the part of
+        the file name before the first dot (e.g. ``Syrian.txt`` -> ``Syrian``).
+        ``no_labels`` is accepted but not used here.
+
+        Returns a list of ``{"input": str, "label": str}`` dicts.
+        """
         data = []
-        filenames= ["Jordanian.txt", "Lebanese.txt", "Palestinian.txt", "Syrian.txt"]
-        for name in filenames: 
-            path = Path(data_path) / name 
-            with open(path, "r") as reader: 
-                for line in reader: 
-                    sentence = line.strip() 
+        filenames = ["Jordanian.txt", "Lebanese.txt", "Palestinian.txt", "Syrian.txt"]
+        for name in filenames:
+            path = Path(data_path) / name
+            # Arabic text: decode explicitly instead of relying on the platform
+            # default encoding (assumes the corpus files are UTF-8 -- confirm).
+            with open(path, "r", encoding="utf-8") as reader:
+                for line in reader:
+                    sentence = line.strip()
                     label = name.split(".")[0]
-                    data.append( 
-                        {"input": sentence, "label": label}
-                    )
+                    data.append({"input": sentence, "label": label})
         return data
diff --git a/llmebench/datasets/__init__.py b/llmebench/datasets/__init__.py
index 2c47d72b..54c0b9a3 100644
--- a/llmebench/datasets/__init__.py
+++ b/llmebench/datasets/__init__.py
@@ -36,6 +36,7 @@
 from .SemEval17T1STS import SemEval17T1STSDataset
 from .SemEval17T2STS import SemEval17T2STSDataset
 from .SemEval23T3Propaganda import SemEval23T3PropagandaDataset
+from .ShamiCorpus import ShamiDataset
 from .Spam import SpamDataset
 from .STSQ2Q import Q2QSimDataset
 from .TyDiQA import TyDiQADataset
@@ -49,4 +50,3 @@
 from .XGLUEPOS import XGLUEPOSDataset
 from .XNLI import XNLIDataset
 from .XQuAD import XQuADDataset
-from .ShamiCorpus import ShamiDataset
\ No newline at end of file