Add metadata to all assets (#234)

This commit adds metadata to all assets, including author and model information, descriptions, and scores. Tests and docs are updated to reflect the change.
* Add metadata to all assets
* Update asset tests for metadata
* Update asset tutorial
qcri · Sep 18, 2023 · f14121f · f14121f
1 parent 5814882
commit f14121f
Show file tree

Hide file tree

Showing 286 changed files with 2,541 additions and 11 deletions.
diff --git a/assets/ar/MT/AraBench_Ara2Eng_Helsinki_NLP_Opus_MT_ZeroShot.py b/assets/ar/MT/AraBench_Ara2Eng_Helsinki_NLP_Opus_MT_ZeroShot.py
@@ -3,6 +3,14 @@
 from llmebench.tasks import MachineTranslationTask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Helsinki-NLP/opus-mt-ar-en",
+        "description": "Sample HuggingFace Inference API asset for machine translation.",
+    }
+
+
 def config():
     return {
         "dataset": AraBenchDataset,

diff --git a/assets/ar/MT/AraBench_ar2en_BLOOMZ_ZeroShot.py b/assets/ar/MT/AraBench_ar2en_BLOOMZ_ZeroShot.py
@@ -3,6 +3,14 @@
 from llmebench.tasks import MachineTranslationTask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "bloomz-176b (8bit quantized)",
+        "description": "Locally hosted BLOOMZ 176b model (8 bit quantized version) using Petals.",
+    }
+
+
 def config():
     return {
         "dataset": AraBenchDataset,

diff --git a/assets/ar/MT/AraBench_ar2en_GPT35_ZeroShot.py b/assets/ar/MT/AraBench_ar2en_GPT35_ZeroShot.py
@@ -3,6 +3,14 @@
 from llmebench.tasks import MachineTranslationTask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-35-turbo (version 0301)",
+        "description": "GPT35 model hosted on Azure, using the Completion API. API version '2023-03-15-preview'.",
+    }
+
+
 def config():
     return {
         "dataset": AraBenchDataset,

diff --git a/assets/ar/MT/AraBench_ar2en_GPT4_ZeroShot.py b/assets/ar/MT/AraBench_ar2en_GPT4_ZeroShot.py
@@ -3,6 +3,14 @@
 from llmebench.tasks import MachineTranslationTask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'.",
+    }
+
+
 def config():
     return {
         "dataset": AraBenchDataset,

diff --git a/assets/ar/QA/ARCD_BLOOMZ_ZeroShot.py b/assets/ar/QA/ARCD_BLOOMZ_ZeroShot.py
@@ -3,6 +3,15 @@
 from llmebench.tasks import QATask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "bloomz-176b (8bit quantized)",
+        "description": "Locally hosted BLOOMZ 176b model (8 bit quantized version) using Petals.",
+        "scores": {"F1": "0.368"},
+    }
+
+
 def config():
     return {
         "dataset": ARCDDataset,

diff --git a/assets/ar/QA/ARCD_GPT35_ZeroShot.py b/assets/ar/QA/ARCD_GPT35_ZeroShot.py
@@ -3,6 +3,15 @@
 from llmebench.tasks import QATask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-35-turbo (version 0301)",
+        "description": "GPT35 model hosted on Azure, using the Completion API. API version '2023-03-15-preview'.",
+        "scores": {"F1": "0.502"},
+    }
+
+
 def config():
     return {
         "dataset": ARCDDataset,

diff --git a/assets/ar/QA/ARCD_GPT4_FewShot.py b/assets/ar/QA/ARCD_GPT4_FewShot.py
@@ -7,6 +7,15 @@
 random.seed(3333)
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'. 3 samples were chosen per test sample based on MaxMarginalRelevance for few shot learning.",
+        "scores": {"F1": "0.704"},
+    }
+
+
 def config():
     return {
         "dataset": ARCDDataset,

diff --git a/assets/ar/QA/ARCD_GPT4_ZeroShot.py b/assets/ar/QA/ARCD_GPT4_ZeroShot.py
@@ -3,6 +3,15 @@
 from llmebench.tasks import QATask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'.",
+        "scores": {"F1": "0.705"},
+    }
+
+
 def config():
     return {
         "dataset": ARCDDataset,

diff --git a/assets/ar/QA/ARCD_Random.py b/assets/ar/QA/ARCD_Random.py
@@ -5,6 +5,15 @@
 from llmebench.tasks import QATask, TaskType
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Random",
+        "description": "Random Baseline.",
+        "scores": {"F1": "0.085"},
+    }
+
+
 def config():
     return {
         "dataset": ARCDDataset,

diff --git a/assets/ar/QA/MLQA_BLOOMZ_ZeroShot.py b/assets/ar/QA/MLQA_BLOOMZ_ZeroShot.py
@@ -3,6 +3,15 @@
 from llmebench.tasks import QATask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "bloomz-176b (8bit quantized)",
+        "description": "Locally hosted BLOOMZ 176b model (8 bit quantized version) using Petals.",
+        "scores": {"F1": "0.377"},
+    }
+
+
 def config():
     return {
         "dataset": MLQADataset,

diff --git a/assets/ar/QA/MLQA_GPT35_ZeroShot.py b/assets/ar/QA/MLQA_GPT35_ZeroShot.py
@@ -3,6 +3,15 @@
 from llmebench.tasks import QATask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-35-turbo (version 0301)",
+        "description": "GPT35 model hosted on Azure, using the Completion API. API version '2023-03-15-preview'.",
+        "scores": {"F1": "0.376"},
+    }
+
+
 def config():
     return {
         "dataset": MLQADataset,

diff --git a/assets/ar/QA/MLQA_GPT4_FewShot.py b/assets/ar/QA/MLQA_GPT4_FewShot.py
@@ -7,6 +7,15 @@
 random.seed(3333)
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'. 3 samples were chosen per test sample based on MaxMarginalRelevance for few shot learning.",
+        "scores": {"F1": "0.653"},
+    }
+
+
 def config():
     return {
         "dataset": MLQADataset,

diff --git a/assets/ar/QA/MLQA_GPT4_ZeroShot.py b/assets/ar/QA/MLQA_GPT4_ZeroShot.py
@@ -3,6 +3,15 @@
 from llmebench.tasks import QATask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'.",
+        "scores": {"F1": "0.620"},
+    }
+
+
 def config():
     return {
         "dataset": MLQADataset,

diff --git a/assets/ar/QA/MLQA_Random.py b/assets/ar/QA/MLQA_Random.py
@@ -5,6 +5,15 @@
 from llmebench.tasks import QATask, TaskType
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Random",
+        "description": "Random Baseline.",
+        "scores": {"F1": "0.066"},
+    }
+
+
 def config():
     return {
         "dataset": MLQADataset,

diff --git a/assets/ar/QA/MLQA_mdeberta_v3_base_squad2_ZeroShot.py b/assets/ar/QA/MLQA_mdeberta_v3_base_squad2_ZeroShot.py
@@ -3,6 +3,15 @@
 from llmebench.tasks import QATask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "timpal0l/mdeberta-v3-base-squad2",
+        "description": "Sample HuggingFace Inference API asset for question answering.",
+        "scores": {"F1": "ar/QA/MLQA"},  # NOTE(review): looks like a task path, not a numeric F1 like the sibling assets (e.g. "0.368") — confirm the intended score
+    }
+
+
 def config():
     return {
         "dataset": MLQADataset,

diff --git a/assets/ar/QA/TyDiQA_BLOOMZ_ZeroShot.py b/assets/ar/QA/TyDiQA_BLOOMZ_ZeroShot.py
@@ -3,6 +3,15 @@
 from llmebench.tasks import QATask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "bloomz-176b (8bit quantized)",
+        "description": "Locally hosted BLOOMZ 176b model (8 bit quantized version) using Petals.",
+        "scores": {"F1": "0.456"},
+    }
+
+
 def config():
     return {
         "dataset": TyDiQADataset,

diff --git a/assets/ar/QA/TyDiQA_GPT35_ZeroShot.py b/assets/ar/QA/TyDiQA_GPT35_ZeroShot.py
@@ -3,6 +3,15 @@
 from llmebench.tasks import QATask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-35-turbo (version 0301)",
+        "description": "GPT35 model hosted on Azure, using the Completion API. API version '2023-03-15-preview'.",
+        "scores": {"F1": "0.480"},
+    }
+
+
 def config():
     return {
         "dataset": TyDiQADataset,

diff --git a/assets/ar/QA/TyDiQA_GPT4_FewShot.py b/assets/ar/QA/TyDiQA_GPT4_FewShot.py
@@ -7,6 +7,15 @@
 random.seed(3333)
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'. 3 samples were chosen per test sample based on MaxMarginalRelevance for few shot learning.",
+        "scores": {"F1": "0.739"},
+    }
+
+
 def config():
     return {
         "dataset": TyDiQADataset,

diff --git a/assets/ar/QA/TyDiQA_GPT4_ZeroShot.py b/assets/ar/QA/TyDiQA_GPT4_ZeroShot.py
@@ -3,6 +3,15 @@
 from llmebench.tasks import QATask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'.",
+        "scores": {"F1": "0.744"},
+    }
+
+
 def config():
     return {
         "dataset": TyDiQADataset,

diff --git a/assets/ar/QA/TyDiQA_Random.py b/assets/ar/QA/TyDiQA_Random.py
@@ -5,6 +5,15 @@
 from llmebench.tasks import QATask, TaskType
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Random",
+        "description": "Random Baseline.",
+        "scores": {"F1": "0.111"},
+    }
+
+
 def config():
     return {
         "dataset": TyDiQADataset,

diff --git a/assets/ar/QA/XQuAD_BLOOMZ_ZeroShot.py b/assets/ar/QA/XQuAD_BLOOMZ_ZeroShot.py
@@ -3,6 +3,15 @@
 from llmebench.tasks import QATask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "bloomz-176b (8bit quantized)",
+        "description": "Locally hosted BLOOMZ 176b model (8 bit quantized version) using Petals.",
+        "scores": {"F1": "0.367"},
+    }
+
+
 def config():
     return {
         "dataset": XQuADDataset,

diff --git a/assets/ar/QA/XQuAD_GPT35_ZeroShot.py b/assets/ar/QA/XQuAD_GPT35_ZeroShot.py
@@ -3,6 +3,15 @@
 from llmebench.tasks import QATask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-35-turbo (version 0301)",
+        "description": "GPT35 model hosted on Azure, using the Completion API. API version '2023-03-15-preview'.",
+        "scores": {"F1": "0.442"},
+    }
+
+
 def config():
     return {
         "dataset": XQuADDataset,

diff --git a/assets/ar/QA/XQuAD_GPT4_FewShot.py b/assets/ar/QA/XQuAD_GPT4_FewShot.py
@@ -7,6 +7,15 @@
 random.seed(3333)
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'. 3 samples were chosen per test sample based on MaxMarginalRelevance for few shot learning.",
+        "scores": {"F1": "0.722"},
+    }
+
+
 def config():
     return {
         "dataset": XQuADDataset,

diff --git a/assets/ar/QA/XQuAD_GPT4_ZeroShot.py b/assets/ar/QA/XQuAD_GPT4_ZeroShot.py
@@ -3,6 +3,15 @@
 from llmebench.tasks import QATask
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "gpt-4-32k (version 0314)",
+        "description": "GPT4 32k tokens model hosted on Azure, using the ChatCompletion API. API version '2023-03-15-preview'.",
+        "scores": {"F1": "0.729"},
+    }
+
+
 def config():
     return {
         "dataset": XQuADDataset,

diff --git a/assets/ar/QA/XQuAD_Random.py b/assets/ar/QA/XQuAD_Random.py
@@ -5,6 +5,15 @@
 from llmebench.tasks import QATask, TaskType
 
 
+def metadata():
+    return {
+        "author": "Arabic Language Technologies, QCRI, HBKU",
+        "model": "Random",
+        "description": "Random Baseline.",
+        "scores": {"F1": "0.047"},
+    }
+
+
 def config():
     return {
         "dataset": XQuADDataset,