branch rebase

qcri · Sep 10, 2023 · 8be02c5 · 8be02c5
2 parents 060609e + 421839f
commit 8be02c5
Show file tree

Hide file tree

Showing 289 changed files with 3,511 additions and 3,060 deletions.
diff --git a/.gitignore b/.gitignore
@@ -21,3 +21,6 @@ data
 
 # Temporary
 tmp
+
+# Model configs
+envs
diff --git a/CITATION.bib b/CITATION.bib
@@ -0,0 +1,9 @@
+@article{dalvi2023llmebench,
+      title={LLMeBench: A Flexible Framework for Accelerating LLMs Benchmarking}, 
+      author={Fahim Dalvi and Maram Hasanain and Sabri Boughorbel and Basel Mousi and Samir Abdaljalil and Nizi Nazar and Ahmed Abdelali and Shammur Absar Chowdhury and Hamdy Mubarak and Ahmed Ali and Majd Hawasly and Nadir Durrani and Firoj Alam},
+      year={2023},
+      eprint={2308.04945},
+      journal={arXiv:2308.04945},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2308.04945}
+}
diff --git a/.../MT/AraBench_Ara2Eng_FastChat_ZeroShot.py → .../MT/AraBench_Ara2Eng_FastChat_ZeroShot.py b/.../MT/AraBench_Ara2Eng_FastChat_ZeroShot.py → .../MT/AraBench_Ara2Eng_FastChat_ZeroShot.py
diff --git a/...v1/MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py → ...s/ar/MT/AraBench_ar2en_BLOOMZ_ZeroShot.py b/...v1/MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py → ...s/ar/MT/AraBench_ar2en_BLOOMZ_ZeroShot.py
@@ -1,7 +1,5 @@
-import os
-
 from llmebench.datasets import AraBenchDataset
-from llmebench.models import BLOOMPetalModel
+from llmebench.models import PetalsModel
 from llmebench.tasks import MachineTranslationTask
 
 
@@ -59,9 +57,8 @@ def config():
                     },
                     "task": MachineTranslationTask,
                     "task_args": {},
-                    "model": BLOOMPetalModel,
+                    "model": PetalsModel,
                     "model_args": {
-                        "api_url": os.environ["API_URL"],
                         "max_tries": 3,
                     },
                     "general_args": {"data_path": "data/MT/"},

diff --git a/...1/MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py → ...ts/ar/MT/AraBench_ar2en_GPT35_ZeroShot.py b/...1/MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py → ...ts/ar/MT/AraBench_ar2en_GPT35_ZeroShot.py
@@ -1,7 +1,5 @@
-import os
-
 from llmebench.datasets import AraBenchDataset
-from llmebench.models import GPTModel
+from llmebench.models import LegacyOpenAIModel
 from llmebench.tasks import MachineTranslationTask
 
 
@@ -59,13 +57,8 @@ def config():
                     },
                     "task": MachineTranslationTask,
                     "task_args": {},
-                    "model": GPTModel,
+                    "model": LegacyOpenAIModel,
                     "model_args": {
-                        "api_type": "azure",
-                        "api_version": "2023-03-15-preview",
-                        "api_base": os.environ["AZURE_API_URL"],
-                        "api_key": os.environ["AZURE_API_KEY"],
-                        "engine_name": os.environ["ENGINE_NAME"],
                         "max_tries": 5,
                     },
                     "general_args": {"data_path": "data/MT/"},

diff --git a/.../MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py → assets/ar/MT/AraBench_ar2en_GPT4_ZeroShot.py b/.../MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py → assets/ar/MT/AraBench_ar2en_GPT4_ZeroShot.py
@@ -1,7 +1,5 @@
-import os
-
 from llmebench.datasets import AraBenchDataset
-from llmebench.models import GPTChatCompletionModel
+from llmebench.models import OpenAIModel
 from llmebench.tasks import MachineTranslationTask
 
 
@@ -58,13 +56,8 @@ def config():
                     },
                     "task": MachineTranslationTask,
                     "task_args": {},
-                    "model": GPTChatCompletionModel,
+                    "model": OpenAIModel,
                     "model_args": {
-                        "api_type": "azure",
-                        "api_version": "2023-03-15-preview",
-                        "api_base": os.environ["AZURE_API_URL"],
-                        "api_key": os.environ["AZURE_API_KEY"],
-                        "engine_name": os.environ["ENGINE_NAME"],
                         "max_tries": 5,
                     },
                     "general_args": {"data_path": "data/MT/"},

diff --git a/...s/benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py → assets/ar/QA/ARCD_BLOOMZ_ZeroShot.py b/...s/benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py → assets/ar/QA/ARCD_BLOOMZ_ZeroShot.py
@@ -1,7 +1,5 @@
-import os
-
 from llmebench.datasets import ARCDDataset
-from llmebench.models import BLOOMPetalModel
+from llmebench.models import PetalsModel
 from llmebench.tasks import QATask
 
 
@@ -11,9 +9,8 @@ def config():
         "dataset_args": {},
         "task": QATask,
         "task_args": {},
-        "model": BLOOMPetalModel,
+        "model": PetalsModel,
         "model_args": {
-            "api_url": os.environ["API_URL"],
             "max_tries": 5,
         },
         "general_args": {"data_path": "data/QA/ARCD/arcd-test.json"},

diff --git a/.../benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py → assets/ar/QA/ARCD_GPT35_ZeroShot.py b/.../benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py → assets/ar/QA/ARCD_GPT35_ZeroShot.py
@@ -1,7 +1,5 @@
-import os
-
 from llmebench.datasets import ARCDDataset
-from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.models import LegacyOpenAIModel
 from llmebench.tasks import QATask
 
 
@@ -11,13 +9,8 @@ def config():
         "dataset_args": {},
         "task": QATask,
         "task_args": {},
-        "model": GPTModel,
+        "model": LegacyOpenAIModel,
         "model_args": {
-            "api_type": "azure",
-            "api_version": "2023-03-15-preview",
-            "api_base": os.environ["AZURE_API_URL"],
-            "api_key": os.environ["AZURE_API_KEY"],
-            "engine_name": os.environ["ENGINE_NAME"],
             "max_tries": 3,
         },
         "general_args": {"data_path": "data/QA/ARCD/arcd-test.json"},

diff --git a/...k_v1/QA/ARCD_GPTChatCompletion_FewShot.py → assets/ar/QA/ARCD_GPT4_FewShot.py b/...k_v1/QA/ARCD_GPTChatCompletion_FewShot.py → assets/ar/QA/ARCD_GPT4_FewShot.py
@@ -1,8 +1,7 @@
-import os
 import random
 
 from llmebench.datasets import ARCDDataset
-from llmebench.models import GPTChatCompletionModel
+from llmebench.models import OpenAIModel
 from llmebench.tasks import QATask
 
 random.seed(3333)
@@ -14,14 +13,8 @@ def config():
         "dataset_args": {},
         "task": QATask,
         "task_args": {},
-        "model": GPTChatCompletionModel,
+        "model": OpenAIModel,
         "model_args": {
-            "api_type": "azure",
-            "api_version": "2023-03-15-preview",
-            "api_base": os.environ["AZURE_API_URL"],
-            "api_key": os.environ["AZURE_API_KEY"],
-            "engine_name": os.environ["ENGINE_NAME"],
-            "class_labels": "NA",
             "max_tries": 30,
         },
         "general_args": {

diff --git a/..._v1/QA/ARCD_GPTChatCompletion_ZeroShot.py → assets/ar/QA/ARCD_GPT4_ZeroShot.py b/..._v1/QA/ARCD_GPTChatCompletion_ZeroShot.py → assets/ar/QA/ARCD_GPT4_ZeroShot.py
@@ -1,7 +1,5 @@
-import os
-
 from llmebench.datasets import ARCDDataset
-from llmebench.models import GPTChatCompletionModel
+from llmebench.models import OpenAIModel
 from llmebench.tasks import QATask
 
 
@@ -11,14 +9,8 @@ def config():
         "dataset_args": {},
         "task": QATask,
         "task_args": {},
-        "model": GPTChatCompletionModel,
+        "model": OpenAIModel,
         "model_args": {
-            "api_type": "azure",
-            "api_version": "2023-03-15-preview",
-            "api_base": os.environ["AZURE_API_URL"],
-            "api_key": os.environ["AZURE_API_KEY"],
-            "engine_name": os.environ["ENGINE_NAME"],
-            "class_labels": "NA",
             "max_tries": 50,
         },
         "general_args": {"data_path": "data/QA/arcd/arcd-test.json"},

diff --git a/...s/benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py → assets/ar/QA/MLQA_BLOOMZ_ZeroShot.py b/...s/benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py → assets/ar/QA/MLQA_BLOOMZ_ZeroShot.py
@@ -1,7 +1,5 @@
-import os
-
 from llmebench.datasets import MLQADataset
-from llmebench.models import BLOOMPetalModel
+from llmebench.models import PetalsModel
 from llmebench.tasks import QATask
 
 
@@ -11,9 +9,8 @@ def config():
         "dataset_args": {},
         "task": QATask,
         "task_args": {},
-        "model": BLOOMPetalModel,
+        "model": PetalsModel,
         "model_args": {
-            "api_url": os.environ["API_URL"],
             "max_tries": 5,
         },
         "general_args": {

diff --git a/.../benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py → assets/ar/QA/MLQA_GPT35_ZeroShot.py b/.../benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py → assets/ar/QA/MLQA_GPT35_ZeroShot.py
@@ -1,7 +1,5 @@
-import os
-
 from llmebench.datasets import MLQADataset
-from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.models import LegacyOpenAIModel
 from llmebench.tasks import QATask
 
 
@@ -11,13 +9,8 @@ def config():
         "dataset_args": {},
         "task": QATask,
         "task_args": {},
-        "model": GPTModel,
+        "model": LegacyOpenAIModel,
         "model_args": {
-            "api_type": "azure",
-            "api_version": "2023-03-15-preview",
-            "api_base": os.environ["AZURE_API_URL"],
-            "api_key": os.environ["AZURE_API_KEY"],
-            "engine_name": os.environ["ENGINE_NAME"],
             "max_tries": 3,
         },
         "general_args": {

diff --git a/...k_v1/QA/MLQA_GPTChatCompletion_FewShot.py → assets/ar/QA/MLQA_GPT4_FewShot.py b/...k_v1/QA/MLQA_GPTChatCompletion_FewShot.py → assets/ar/QA/MLQA_GPT4_FewShot.py
@@ -1,8 +1,7 @@
-import os
 import random
 
 from llmebench.datasets import MLQADataset
-from llmebench.models import GPTChatCompletionModel
+from llmebench.models import OpenAIModel
 from llmebench.tasks import QATask
 
 random.seed(3333)
@@ -14,14 +13,8 @@ def config():
         "dataset_args": {},
         "task": QATask,
         "task_args": {},
-        "model": GPTChatCompletionModel,
+        "model": OpenAIModel,
         "model_args": {
-            "api_type": "azure",
-            "api_version": "2023-03-15-preview",
-            "api_base": os.environ["AZURE_API_URL"],
-            "api_key": os.environ["AZURE_API_KEY"],
-            "engine_name": os.environ["ENGINE_NAME"],
-            "class_labels": "NA",
             "max_tries": 30,
         },
         "general_args": {

diff --git a/..._v1/QA/MLQA_GPTChatCompletion_ZeroShot.py → assets/ar/QA/MLQA_GPT4_ZeroShot.py b/..._v1/QA/MLQA_GPTChatCompletion_ZeroShot.py → assets/ar/QA/MLQA_GPT4_ZeroShot.py
@@ -1,7 +1,5 @@
-import os
-
 from llmebench.datasets import MLQADataset
-from llmebench.models import GPTChatCompletionModel
+from llmebench.models import OpenAIModel
 from llmebench.tasks import QATask
 
 
@@ -11,14 +9,8 @@ def config():
         "dataset_args": {},
         "task": QATask,
         "task_args": {},
-        "model": GPTChatCompletionModel,
+        "model": OpenAIModel,
         "model_args": {
-            "api_type": "azure",
-            "api_version": "2023-03-15-preview",
-            "api_base": os.environ["AZURE_API_URL"],
-            "api_key": os.environ["AZURE_API_KEY"],
-            "engine_name": os.environ["ENGINE_NAME"],
-            "class_labels": "NA",
             "max_tries": 50,
         },
         "general_args": {

diff --git a/...benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py → assets/ar/QA/TyDiQA_BLOOMZ_ZeroShot.py b/...benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py → assets/ar/QA/TyDiQA_BLOOMZ_ZeroShot.py
@@ -1,7 +1,5 @@
-import os
-
 from llmebench.datasets import TyDiQADataset
-from llmebench.models import BLOOMPetalModel
+from llmebench.models import PetalsModel
 from llmebench.tasks import QATask
 
 
@@ -11,9 +9,8 @@ def config():
         "dataset_args": {},
         "task": QATask,
         "task_args": {},
-        "model": BLOOMPetalModel,
+        "model": PetalsModel,
         "model_args": {
-            "api_url": os.environ["API_URL"],
             "max_tries": 5,
         },
         "general_args": {"data_path": "data/QA/tydiqa/tydiqa-goldp-dev-arabic.json"},

diff --git a/...enchmark_v1/QA/TyDiQA_ChatGPT_ZeroShot.py → assets/ar/QA/TyDiQA_GPT35_ZeroShot.py b/...enchmark_v1/QA/TyDiQA_ChatGPT_ZeroShot.py → assets/ar/QA/TyDiQA_GPT35_ZeroShot.py
@@ -1,7 +1,5 @@
-import os
-
 from llmebench.datasets import TyDiQADataset
-from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.models import LegacyOpenAIModel
 from llmebench.tasks import QATask
 
 
@@ -11,13 +9,8 @@ def config():
         "dataset_args": {},
         "task": QATask,
         "task_args": {},
-        "model": GPTModel,
+        "model": LegacyOpenAIModel,
         "model_args": {
-            "api_type": "azure",
-            "api_version": "2023-03-15-preview",
-            "api_base": os.environ["AZURE_API_URL"],
-            "api_key": os.environ["AZURE_API_KEY"],
-            "engine_name": os.environ["ENGINE_NAME"],
             "max_tries": 3,
         },
         "general_args": {"data_path": "data/QA/tydiqa/tydiqa-goldp-dev-arabic.json"},

diff --git a/...v1/QA/TyDiQA_GPTChatCompletion_FewShot.py → assets/ar/QA/TyDiQA_GPT4_FewShot.py b/...v1/QA/TyDiQA_GPTChatCompletion_FewShot.py → assets/ar/QA/TyDiQA_GPT4_FewShot.py
@@ -1,8 +1,7 @@
-import os
 import random
 
 from llmebench.datasets import TyDiQADataset
-from llmebench.models import GPTChatCompletionModel
+from llmebench.models import OpenAIModel
 from llmebench.tasks import QATask
 
 random.seed(3333)
@@ -14,14 +13,8 @@ def config():
         "dataset_args": {},
         "task": QATask,
         "task_args": {},
-        "model": GPTChatCompletionModel,
+        "model": OpenAIModel,
         "model_args": {
-            "api_type": "azure",
-            "api_version": "2023-03-15-preview",
-            "api_base": os.environ["AZURE_API_URL"],
-            "api_key": os.environ["AZURE_API_KEY"],
-            "engine_name": os.environ["ENGINE_NAME"],
-            "class_labels": "NA",
             "max_tries": 30,
         },
         "general_args": {

diff --git a/...1/QA/TydiQA_GPTChatCompletion_ZeroShot.py → assets/ar/QA/TydiQA_GPT4_ZeroShot.py b/...1/QA/TydiQA_GPTChatCompletion_ZeroShot.py → assets/ar/QA/TydiQA_GPT4_ZeroShot.py
@@ -1,7 +1,5 @@
-import os
-
 from llmebench.datasets import TyDiQADataset
-from llmebench.models import GPTChatCompletionModel
+from llmebench.models import OpenAIModel
 from llmebench.tasks import QATask
 
 
@@ -11,14 +9,8 @@ def config():
         "dataset_args": {},
         "task": QATask,
         "task_args": {},
-        "model": GPTChatCompletionModel,
+        "model": OpenAIModel,
         "model_args": {
-            "api_type": "azure",
-            "api_version": "2023-03-15-preview",
-            "api_base": os.environ["AZURE_API_URL"],
-            "api_key": os.environ["AZURE_API_KEY"],
-            "engine_name": os.environ["ENGINE_NAME"],
-            "class_labels": "NA",
             "max_tries": 50,
         },
         "general_args": {"data_path": "data/QA/tydiqa/tydiqa-goldp-dev-arabic.json"},

diff --git a/.../benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py → assets/ar/QA/XQuAD_BLOOMZ_ZeroShot.py b/.../benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py → assets/ar/QA/XQuAD_BLOOMZ_ZeroShot.py
@@ -1,7 +1,5 @@
-import os
-
 from llmebench.datasets import XQuADDataset
-from llmebench.models import BLOOMPetalModel
+from llmebench.models import PetalsModel
 from llmebench.tasks import QATask
 
 
@@ -11,9 +9,8 @@ def config():
         "dataset_args": {},
         "task": QATask,
         "task_args": {},
-        "model": BLOOMPetalModel,
+        "model": PetalsModel,
         "model_args": {
-            "api_url": os.environ["API_URL"],
             "max_tries": 5,
         },
         "general_args": {"data_path": "data/QA/xquad/xquad.ar.json"},
-Original file line number
+Diff line change
@@ Expand Up / @@ -21,3 +21,6 @@ data @@
     # Temporary
     tmp
+    # Model configs
+    envs