Reorganize assets and unify naming scheme (#191)

A new naming scheme roughly corresponding to `language_code/task_category/task/dataset_model_nshot.py` is introduced in this commit. Model/Dataset/Task names have also been standardized, along with removal of some duplicate assets.

* Reorganize assets and unify naming scheme
* Remove duplicate GPT4 propaganda asset
* Remove duplicate GPT3.5 harmfulness detection asset
* Remove duplicate GPT3.5 claim detection asset, and unify naming scheme for others
* Add missing Lemmatization assets for BLOOMZ and GPT4
* Rename gender assets to remove redundant 'Gender' prefix
* Rename CT22/CT23/Propaganda datasets
* Fix language codes and format code
* Fix HateSpeech and Offensive dataset names
* Fix Parsing citations and dataset name
* Fix ArSAS dataset name
* Fix ADI dataset name
* Fix default label type in Propaganda task
* Fix incorrect Dataset in Harmful GPT4 asset
* Rename datasets, assets and tasks for stance and fact
* Fix Lemmatization dataset name and citation
* Format code
* Fix Diacritization dataset name and citation
* Add Dialectal Diacritization dataset and asset
* Add GPT4 diacritization assets
* Split segmentation assets across correct datasets
* Split POS assets across correct datasets and add proper citations
* Merged similar tasks and updated assets
* Format code
* Rename STS assets to match convention

---------

Co-authored-by: maramhasanain <[email protected]>
qcri · Sep 6, 2023 · 6509529 · 6509529
1 parent 9c6e202
commit 6509529
Show file tree

Hide file tree

Showing 261 changed files with 2,175 additions and 1,198 deletions.
diff --git a/...v1/MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py → ...s/ar/MT/AraBench_ar2en_BLOOMZ_ZeroShot.py b/...v1/MT/AraBench_Ara2Eng_BLOOMZ_ZeroShot.py → ...s/ar/MT/AraBench_ar2en_BLOOMZ_ZeroShot.py
diff --git a/...1/MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py → ...ts/ar/MT/AraBench_ar2en_GPT35_ZeroShot.py b/...1/MT/AraBench_Ara2Eng_ChatGPT_ZeroShot.py → ...ts/ar/MT/AraBench_ar2en_GPT35_ZeroShot.py
diff --git a/.../MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py → assets/ar/MT/AraBench_ar2en_GPT4_ZeroShot.py b/.../MT/AraBench_Ara2Eng_ChatGPT4_ZeroShot.py → assets/ar/MT/AraBench_ar2en_GPT4_ZeroShot.py
diff --git a/...s/benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py → assets/ar/QA/ARCD_BLOOMZ_ZeroShot.py b/...s/benchmark_v1/QA/ARCD_BLOOMZ_ZeroShot.py → assets/ar/QA/ARCD_BLOOMZ_ZeroShot.py
diff --git a/.../benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py → assets/ar/QA/ARCD_GPT35_ZeroShot.py b/.../benchmark_v1/QA/ARCD_ChatGPT_ZeroShot.py → assets/ar/QA/ARCD_GPT35_ZeroShot.py
diff --git a/...k_v1/QA/ARCD_GPTChatCompletion_FewShot.py → assets/ar/QA/ARCD_GPT4_FewShot.py b/...k_v1/QA/ARCD_GPTChatCompletion_FewShot.py → assets/ar/QA/ARCD_GPT4_FewShot.py
diff --git a/..._v1/QA/ARCD_GPTChatCompletion_ZeroShot.py → assets/ar/QA/ARCD_GPT4_ZeroShot.py b/..._v1/QA/ARCD_GPTChatCompletion_ZeroShot.py → assets/ar/QA/ARCD_GPT4_ZeroShot.py
diff --git a/...s/benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py → assets/ar/QA/MLQA_BLOOMZ_ZeroShot.py b/...s/benchmark_v1/QA/MLQA_BLOOMZ_ZeroShot.py → assets/ar/QA/MLQA_BLOOMZ_ZeroShot.py
diff --git a/.../benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py → assets/ar/QA/MLQA_GPT35_ZeroShot.py b/.../benchmark_v1/QA/MLQA_ChatGPT_ZeroShot.py → assets/ar/QA/MLQA_GPT35_ZeroShot.py
diff --git a/...k_v1/QA/MLQA_GPTChatCompletion_FewShot.py → assets/ar/QA/MLQA_GPT4_FewShot.py b/...k_v1/QA/MLQA_GPTChatCompletion_FewShot.py → assets/ar/QA/MLQA_GPT4_FewShot.py
diff --git a/..._v1/QA/MLQA_GPTChatCompletion_ZeroShot.py → assets/ar/QA/MLQA_GPT4_ZeroShot.py b/..._v1/QA/MLQA_GPTChatCompletion_ZeroShot.py → assets/ar/QA/MLQA_GPT4_ZeroShot.py
diff --git a/...benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py → assets/ar/QA/TyDiQA_BLOOMZ_ZeroShot.py b/...benchmark_v1/QA/TyDiQA_BLOOMZ_ZeroShot.py → assets/ar/QA/TyDiQA_BLOOMZ_ZeroShot.py
diff --git a/...enchmark_v1/QA/TyDiQA_ChatGPT_ZeroShot.py → assets/ar/QA/TyDiQA_GPT35_ZeroShot.py b/...enchmark_v1/QA/TyDiQA_ChatGPT_ZeroShot.py → assets/ar/QA/TyDiQA_GPT35_ZeroShot.py
diff --git a/...v1/QA/TyDiQA_GPTChatCompletion_FewShot.py → assets/ar/QA/TyDiQA_GPT4_FewShot.py b/...v1/QA/TyDiQA_GPTChatCompletion_FewShot.py → assets/ar/QA/TyDiQA_GPT4_FewShot.py
diff --git a/...1/QA/TydiQA_GPTChatCompletion_ZeroShot.py → assets/ar/QA/TydiQA_GPT4_ZeroShot.py b/...1/QA/TydiQA_GPTChatCompletion_ZeroShot.py → assets/ar/QA/TydiQA_GPT4_ZeroShot.py
diff --git a/.../benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py → assets/ar/QA/XQuAD_BLOOMZ_ZeroShot.py b/.../benchmark_v1/QA/XQuAD_BLOOMZ_ZeroShot.py → assets/ar/QA/XQuAD_BLOOMZ_ZeroShot.py
diff --git a/...benchmark_v1/QA/XQuAD_ChatGPT_ZeroShot.py → assets/ar/QA/XQuAD_GPT35_ZeroShot.py b/...benchmark_v1/QA/XQuAD_ChatGPT_ZeroShot.py → assets/ar/QA/XQuAD_GPT35_ZeroShot.py
diff --git a/...v1/QA/XQuAD_GPTChatCompletion_ZeroShot.py → assets/ar/QA/XQuAD_GPT4_ZeroShot.py b/...v1/QA/XQuAD_GPTChatCompletion_ZeroShot.py → assets/ar/QA/XQuAD_GPT4_ZeroShot.py
diff --git a/..._v1/QA/XQuaD_GPTChatCompletion_FewShot.py → assets/ar/QA/XQuaD_GPT4_FewShot.py b/..._v1/QA/XQuaD_GPTChatCompletion_FewShot.py → assets/ar/QA/XQuaD_GPT4_FewShot.py
diff --git a/.../gender/GenderArabGend_BLOOMZ_ZeroShot.py → ...ibutes/gender/ArabGend_BLOOMZ_ZeroShot.py b/.../gender/GenderArabGend_BLOOMZ_ZeroShot.py → ...ibutes/gender/ArabGend_BLOOMZ_ZeroShot.py
diff --git a/...gender/GenderArabGend_ChatGPT_ZeroShot.py → ...ributes/gender/ArabGend_GPT35_ZeroShot.py b/...gender/GenderArabGend_ChatGPT_ZeroShot.py → ...ributes/gender/ArabGend_GPT35_ZeroShot.py
diff --git a/...derArabGend_GPTChatCompletion_ZeroShot.py → ...tributes/gender/ArabGend_GPT4_ZeroShot.py b/...derArabGend_GPTChatCompletion_ZeroShot.py → ...tributes/gender/ArabGend_GPT4_ZeroShot.py
diff --git a/...gender/GenderArapTweet_BLOOMZ_ZeroShot.py → ...butes/gender/ArapTweet_BLOOMZ_ZeroShot.py b/...gender/GenderArapTweet_BLOOMZ_ZeroShot.py → ...butes/gender/ArapTweet_BLOOMZ_ZeroShot.py
diff --git a/...ender/GenderArapTweet_ChatGPT_ZeroShot.py → ...ibutes/gender/ArapTweet_GPT35_ZeroShot.py b/...ender/GenderArapTweet_ChatGPT_ZeroShot.py → ...ibutes/gender/ArapTweet_GPT35_ZeroShot.py
diff --git a/...derArapTweet_GPTChatCompletion_FewShot.py → ...tributes/gender/ArapTweet_GPT4_FewShot.py b/...derArapTweet_GPTChatCompletion_FewShot.py → ...tributes/gender/ArapTweet_GPT4_FewShot.py
diff --git a/...erArapTweet_GPTChatCompletion_ZeroShot.py → ...ributes/gender/ArapTweet_GPT4_ZeroShot.py b/...erArapTweet_GPTChatCompletion_ZeroShot.py → ...ributes/gender/ArapTweet_GPT4_ZeroShot.py
diff --git a/...aphy/location/Location_BLOOMZ_ZeroShot.py → ...utes/location/Location_BLOOMZ_ZeroShot.py b/...aphy/location/Location_BLOOMZ_ZeroShot.py → ...utes/location/Location_BLOOMZ_ZeroShot.py
diff --git a/...phy/location/Location_ChatGPT_ZeroShot.py → ...butes/location/Location_GPT35_ZeroShot.py b/...phy/location/Location_ChatGPT_ZeroShot.py → ...butes/location/Location_GPT35_ZeroShot.py
diff --git a/...ion/Location_GPTChatCompletion_FewShot.py → ...ributes/location/Location_GPT4_FewShot.py b/...ion/Location_GPTChatCompletion_FewShot.py → ...ributes/location/Location_GPT4_FewShot.py
diff --git a/...on/Location_GPTChatCompletion_ZeroShot.py → ...ibutes/location/Location_GPT4_ZeroShot.py b/...on/Location_GPTChatCompletion_ZeroShot.py → ...ibutes/location/Location_GPT4_ZeroShot.py
diff --git a/...phy/name_info/NameInfo_BLOOMZ_ZeroShot.py → ...tes/name_info/NameInfo_BLOOMZ_ZeroShot.py b/...phy/name_info/NameInfo_BLOOMZ_ZeroShot.py → ...tes/name_info/NameInfo_BLOOMZ_ZeroShot.py
diff --git a/...hy/name_info/NameInfo_ChatGPT_ZeroShot.py → ...utes/name_info/NameInfo_GPT35_ZeroShot.py b/...hy/name_info/NameInfo_ChatGPT_ZeroShot.py → ...utes/name_info/NameInfo_GPT35_ZeroShot.py
diff --git a/...nfo/NameInfo_GPTChatCompletion_FewShot.py → ...ibutes/name_info/NameInfo_GPT4_FewShot.py b/...nfo/NameInfo_GPTChatCompletion_FewShot.py → ...ibutes/name_info/NameInfo_GPT4_FewShot.py
diff --git a/...fo/NameInfo_GPTChatCompletion_ZeroShot.py → ...butes/name_info/NameInfo_GPT4_ZeroShot.py b/...fo/NameInfo_GPTChatCompletion_ZeroShot.py → ...butes/name_info/NameInfo_GPT4_ZeroShot.py
diff --git a/..._harmful_content/Adult_BLOOMZ_ZeroShot.py → ...ontent_detection/Adult_BLOOMZ_ZeroShot.py b/..._harmful_content/Adult_BLOOMZ_ZeroShot.py → ...ontent_detection/Adult_BLOOMZ_ZeroShot.py
diff --git a/...harmful_content/Adult_ChatGPT_ZeroShot.py → ...content_detection/Adult_GPT35_ZeroShot.py b/...harmful_content/Adult_ChatGPT_ZeroShot.py → ...content_detection/Adult_GPT35_ZeroShot.py
diff --git a/...ontent/Adult_GPTChatCompletion_FewShot.py → ...t_content_detection/Adult_GPT4_FewShot.py b/...ontent/Adult_GPTChatCompletion_FewShot.py → ...t_content_detection/Adult_GPT4_FewShot.py
diff --git a/...ntent/Adult_GPTChatCompletion_ZeroShot.py → ..._content_detection/Adult_GPT4_ZeroShot.py b/...ntent/Adult_GPTChatCompletion_ZeroShot.py → ..._content_detection/Adult_GPT4_ZeroShot.py
diff --git a/...ontent/Attentionworthy_BLOOMZ_ZeroShot.py → ...hy/CT22Attentionworthy_BLOOMZ_ZeroShot.py b/...ontent/Attentionworthy_BLOOMZ_ZeroShot.py → ...hy/CT22Attentionworthy_BLOOMZ_ZeroShot.py
@@ -1,13 +1,13 @@
 import os
 
-from llmebench.datasets import AttentionworthyDataset
+from llmebench.datasets import CT22AttentionworthyDataset
 from llmebench.models import BLOOMPetalModel
 from llmebench.tasks import AttentionworthyTask
 
 
 def config():
     return {
-        "dataset": AttentionworthyDataset,
+        "dataset": CT22AttentionworthyDataset,
         "dataset_args": {},
         "task": AttentionworthyTask,
         "task_args": {},

diff --git a/...ntent/Attentionworthy_ChatGPT_ZeroShot.py → ...thy/CT22Attentionworthy_GPT35_ZeroShot.py b/...ntent/Attentionworthy_ChatGPT_ZeroShot.py → ...thy/CT22Attentionworthy_GPT35_ZeroShot.py
@@ -1,14 +1,14 @@
 import os
 import re
 
-from llmebench.datasets import AttentionworthyDataset
-from llmebench.models import GPTModel, RandomGPTModel
+from llmebench.datasets import CT22AttentionworthyDataset
+from llmebench.models import GPTModel
 from llmebench.tasks import AttentionworthyTask
 
 
 def config():
     return {
-        "dataset": AttentionworthyDataset,
+        "dataset": CT22AttentionworthyDataset,
         "dataset_args": {},
         "task": AttentionworthyTask,
         "task_args": {},

diff --git a/...entionworthy_GPTChatCompletion_Fewshot.py → ...orthy/CT22Attentionworthy_GPT4_FewShot.py b/...entionworthy_GPTChatCompletion_Fewshot.py → ...orthy/CT22Attentionworthy_GPT4_FewShot.py
@@ -1,18 +1,13 @@
 import os
-import random
-import re
 
-from llmebench.datasets import AttentionworthyDataset
+from llmebench.datasets import CT22AttentionworthyDataset
 from llmebench.models import GPTChatCompletionModel
 from llmebench.tasks import AttentionworthyTask
 
 
-random.seed(1333)
-
-
 def config():
     return {
-        "dataset": AttentionworthyDataset,
+        "dataset": CT22AttentionworthyDataset,
         "dataset_args": {},
         "task": AttentionworthyTask,
         "task_args": {},

diff --git a/...ntionworthy_GPTChatCompletion_ZeroShot.py → ...rthy/CT22Attentionworthy_GPT4_ZeroShot.py b/...ntionworthy_GPTChatCompletion_ZeroShot.py → ...rthy/CT22Attentionworthy_GPT4_ZeroShot.py
@@ -1,18 +1,13 @@
 import os
-import random
-import re
 
-from llmebench.datasets import AttentionworthyDataset
+from llmebench.datasets import CT22AttentionworthyDataset
 from llmebench.models import GPTChatCompletionModel
 from llmebench.tasks import AttentionworthyTask
 
 
-random.seed(1333)
-
-
 def config():
     return {
-        "dataset": AttentionworthyDataset,
+        "dataset": CT22AttentionworthyDataset,
         "dataset_args": {},
         "task": AttentionworthyTask,
         "task_args": {},

diff --git a/...ontent/Checkworthiness_BLOOMZ_ZeroShot.py → ...ss/CT22Checkworthiness_BLOOMZ_ZeroShot.py b/...ontent/Checkworthiness_BLOOMZ_ZeroShot.py → ...ss/CT22Checkworthiness_BLOOMZ_ZeroShot.py
@@ -1,13 +1,13 @@
 import os
 
-from llmebench.datasets import CheckworthinessDataset
+from llmebench.datasets import CT22CheckworthinessDataset
 from llmebench.models import BLOOMPetalModel
 from llmebench.tasks import CheckworthinessTask
 
 
 def config():
     return {
-        "dataset": CheckworthinessDataset,
+        "dataset": CT22CheckworthinessDataset,
         "dataset_args": {},
         "task": CheckworthinessTask,
         "task_args": {},

diff --git a/...ntent/Checkworthiness_ChatGPT_ZeroShot.py → ...ess/CT22Checkworthiness_GPT35_ZeroShot.py b/...ntent/Checkworthiness_ChatGPT_ZeroShot.py → ...ess/CT22Checkworthiness_GPT35_ZeroShot.py
@@ -1,13 +1,13 @@
 import os
 
-from llmebench.datasets import CheckworthinessDataset
+from llmebench.datasets import CT22CheckworthinessDataset
 from llmebench.models import GPTModel, RandomGPTModel
 from llmebench.tasks import CheckworthinessTask
 
 
 def config():
     return {
-        "dataset": CheckworthinessDataset,
+        "dataset": CT22CheckworthinessDataset,
         "dataset_args": {},
         "task": CheckworthinessTask,
         "task_args": {},

diff --git a/...ckworthiness_GPTChatCompletion_FewShot.py → ...yness/CT22Checkworthiness_GPT4_FewShot.py b/...ckworthiness_GPTChatCompletion_FewShot.py → ...yness/CT22Checkworthiness_GPT4_FewShot.py
@@ -2,7 +2,7 @@
 import random
 import re
 
-from llmebench.datasets import CheckworthinessDataset
+from llmebench.datasets import CT22CheckworthinessDataset
 from llmebench.models import GPTChatCompletionModel
 from llmebench.tasks import CheckworthinessTask
 
@@ -12,7 +12,7 @@
 
 def config():
     return {
-        "dataset": CheckworthinessDataset,
+        "dataset": CT22CheckworthinessDataset,
         "dataset_args": {},
         "task": CheckworthinessTask,
         "task_args": {},

diff --git a/...kworthiness_GPTChatCompletion_ZeroShot.py → ...ness/CT22Checkworthiness_GPT4_ZeroShot.py b/...kworthiness_GPTChatCompletion_ZeroShot.py → ...ness/CT22Checkworthiness_GPT4_ZeroShot.py
@@ -2,7 +2,7 @@
 import random
 import re
 
-from llmebench.datasets import CheckworthinessDataset
+from llmebench.datasets import CT22CheckworthinessDataset
 from llmebench.models import GPTChatCompletionModel
 from llmebench.tasks import CheckworthinessTask
 
@@ -12,7 +12,7 @@
 
 def config():
     return {
-        "dataset": CheckworthinessDataset,
+        "dataset": CT22CheckworthinessDataset,
         "dataset_args": {},
         "task": CheckworthinessTask,
         "task_args": {},

diff --git a/...content/COVClaimDetect_BLOOMZ_ZeroShot.py → ...im_detection/CT22Claim_BLOOMZ_ZeroShot.py b/...content/COVClaimDetect_BLOOMZ_ZeroShot.py → ...im_detection/CT22Claim_BLOOMZ_ZeroShot.py
@@ -1,13 +1,13 @@
 import os
 
-from llmebench.datasets import CovidClaimDataset
+from llmebench.datasets import CT22ClaimDataset
 from llmebench.models import BLOOMPetalModel
 from llmebench.tasks import ClaimDetectionTask
 
 
 def config():
     return {
-        "dataset": CovidClaimDataset,
+        "dataset": CT22ClaimDataset,
         "dataset_args": {},
         "task": ClaimDetectionTask,
         "task_args": {},

diff --git a/...ent/ClaimDetectCOVID19_CGPT35_ZeroShot.py → ...aim_detection/CT22Claim_GPT35_ZeroShot.py b/...ent/ClaimDetectCOVID19_CGPT35_ZeroShot.py → ...aim_detection/CT22Claim_GPT35_ZeroShot.py
@@ -1,13 +1,13 @@
 import os
 
-from llmebench.datasets import CovidClaimDataset
+from llmebench.datasets import CT22ClaimDataset
 from llmebench.models import GPTModel
 from llmebench.tasks import ClaimDetectionTask
 
 
 def config():
     return {
-        "dataset": CovidClaimDataset,
+        "dataset": CT22ClaimDataset,
         "dataset_args": {},
         "task": ClaimDetectionTask,
         "task_args": {},

diff --git a/...ul_content/COVClaimDetect_GPT4_FewShot.py → ...claim_detection/CT22Claim_GPT4_FewShot.py b/...ul_content/COVClaimDetect_GPT4_FewShot.py → ...claim_detection/CT22Claim_GPT4_FewShot.py
@@ -1,13 +1,13 @@
 import os
 
-from llmebench.datasets import CovidClaimDataset
+from llmebench.datasets import CT22ClaimDataset
 from llmebench.models import GPTChatCompletionModel
 from llmebench.tasks import ClaimDetectionTask
 
 
 def config():
     return {
-        "dataset": CovidClaimDataset,
+        "dataset": CT22ClaimDataset,
         "dataset_args": {},
         "task": ClaimDetectionTask,
         "task_args": {},

diff --git a/...tectCOVID19_GPTChatCompletion_ZeroShot.py → ...laim_detection/CT22Claim_GPT4_ZeroShot.py b/...tectCOVID19_GPTChatCompletion_ZeroShot.py → ...laim_detection/CT22Claim_GPT4_ZeroShot.py
@@ -1,18 +1,13 @@
 import os
-import random
-import re
 
-from llmebench.datasets import CheckworthinessDataset
+from llmebench.datasets import CT22CheckworthinessDataset
 from llmebench.models import GPTChatCompletionModel
 from llmebench.tasks import CheckworthinessTask
 
 
-random.seed(1333)
-
-
 def config():
     return {
-        "dataset": CheckworthinessDataset,
+        "dataset": CT22CheckworthinessDataset,
         "dataset_args": {},
         "task": CheckworthinessTask,
         "task_args": {},

diff --git a/...tent/FactualityCOVID19_BLOOMZ_ZeroShot.py → ...lity/COVID19Factuality_BLOOMZ_ZeroShot.py b/...tent/FactualityCOVID19_BLOOMZ_ZeroShot.py → ...lity/COVID19Factuality_BLOOMZ_ZeroShot.py
@@ -1,15 +1,15 @@
 import os
 
-from llmebench.datasets import FactualityCOVID19Dataset
+from llmebench.datasets import COVID19FactualityDataset
 from llmebench.models import BLOOMPetalModel
-from llmebench.tasks import FactualityCOVID19Task
+from llmebench.tasks import FactualityTask
 
 
 def config():
     return {
-        "dataset": FactualityCOVID19Dataset,
+        "dataset": COVID19FactualityDataset,
         "dataset_args": {},
-        "task": FactualityCOVID19Task,
+        "task": FactualityTask,
         "task_args": {},
         "model": BLOOMPetalModel,
         "model_args": {

diff --git a/...alityCOVID19_GPTChatCompletion_FewShot.py → ...tuality/COVID19Factuality_GPT4_FewShot.py b/...alityCOVID19_GPTChatCompletion_FewShot.py → ...tuality/COVID19Factuality_GPT4_FewShot.py
@@ -1,20 +1,15 @@
 import os
-import random
-import re
 
-from llmebench.datasets import FactualityCOVID19Dataset
+from llmebench.datasets import COVID19FactualityDataset
 from llmebench.models import GPTChatCompletionModel
-from llmebench.tasks import FactualityCOVID19Task
-
-
-random.seed(1333)
+from llmebench.tasks import FactualityTask
 
 
 def config():
     return {
-        "dataset": FactualityCOVID19Dataset,
+        "dataset": COVID19FactualityDataset,
         "dataset_args": {},
-        "task": FactualityCOVID19Task,
+        "task": FactualityTask,
         "task_args": {},
         "model": GPTChatCompletionModel,
         "model_args": {

diff --git a/...lityCOVID19_GPTChatCompletion_ZeroShot.py → ...uality/COVID19Factuality_GPT4_ZeroShot.py b/...lityCOVID19_GPTChatCompletion_ZeroShot.py → ...uality/COVID19Factuality_GPT4_ZeroShot.py
@@ -1,20 +1,15 @@
 import os
-import random
-import re
 
-from llmebench.datasets import FactualityCOVID19Dataset
+from llmebench.datasets import COVID19FactualityDataset
 from llmebench.models import GPTChatCompletionModel
-from llmebench.tasks import FactualityCOVID19Task
-
-
-random.seed(1333)
+from llmebench.tasks import FactualityTask
 
 
 def config():
     return {
-        "dataset": FactualityCOVID19Dataset,
+        "dataset": COVID19FactualityDataset,
         "dataset_args": {},
-        "task": FactualityCOVID19Task,
+        "task": FactualityTask,
         "task_args": {},
         "model": GPTChatCompletionModel,
         "model_args": {

diff --git a/...ent/FactualityKhouja20_BLOOMZ_ZeroShot.py → ...ity/Khouja20Factuality_BLOOMZ_ZeroShot.py b/...ent/FactualityKhouja20_BLOOMZ_ZeroShot.py → ...ity/Khouja20Factuality_BLOOMZ_ZeroShot.py
@@ -1,20 +1,15 @@
 import os
-import random
-import re
 
-from llmebench.datasets import FactualityKhouja20Dataset
+from llmebench.datasets import Khouja20FactualityDataset
 from llmebench.models import BLOOMPetalModel
-from llmebench.tasks import FactualityKhouja20Task
-
-
-random.seed(1333)
+from llmebench.tasks import FactualityTask
 
 
 def config():
     return {
-        "dataset": FactualityKhouja20Dataset,
+        "dataset": Khouja20FactualityDataset,
         "dataset_args": {},
-        "task": FactualityKhouja20Task,
+        "task": FactualityTask,
         "task_args": {},
         "model": BLOOMPetalModel,
         "model_args": {

diff --git a/...nt/FactualityKhouja20_ChatGPT_ZeroShot.py → ...lity/Khouja20Factuality_GPT35_ZeroShot.py b/...nt/FactualityKhouja20_ChatGPT_ZeroShot.py → ...lity/Khouja20Factuality_GPT35_ZeroShot.py
@@ -1,15 +1,15 @@
 import os
 
-from llmebench.datasets import FactualityKhouja20Dataset
-from llmebench.models import GPTModel, RandomGPTModel
-from llmebench.tasks import FactualityKhouja20Task
+from llmebench.datasets import Khouja20FactualityDataset
+from llmebench.models import GPTModel
+from llmebench.tasks import FactualityTask
 
 
 def config():
     return {
-        "dataset": FactualityKhouja20Dataset,
+        "dataset": Khouja20FactualityDataset,
         "dataset_args": {},
-        "task": FactualityKhouja20Task,
+        "task": FactualityTask,
         "task_args": {},
         "model": GPTModel,
         "model_args": {

diff --git a/...ontent/FactualityKhouja20_GPT4_FewShot.py → ...uality/Khouja20Factuality_GPT4_FewShot.py b/...ontent/FactualityKhouja20_GPT4_FewShot.py → ...uality/Khouja20Factuality_GPT4_FewShot.py
@@ -1,15 +1,15 @@
 import os
 
-from llmebench.datasets import FactualityKhouja20Dataset
+from llmebench.datasets import Khouja20FactualityDataset
 from llmebench.models import GPTChatCompletionModel
-from llmebench.tasks import FactualityKhouja20Task
+from llmebench.tasks import FactualityTask
 
 
 def config():
     return {
-        "dataset": FactualityKhouja20Dataset,
+        "dataset": Khouja20FactualityDataset,
         "dataset_args": {},
-        "task": FactualityKhouja20Task,
+        "task": FactualityTask,
         "task_args": {},
         "model": GPTChatCompletionModel,
         "model_args": {

diff --git a/...ntent/FactualityKhouja20_GPT4_ZeroShot.py → ...ality/Khouja20Factuality_GPT4_ZeroShot.py b/...ntent/FactualityKhouja20_GPT4_ZeroShot.py → ...ality/Khouja20Factuality_GPT4_ZeroShot.py
@@ -1,15 +1,15 @@
 import os
 
-from llmebench.datasets import FactualityKhouja20Dataset
+from llmebench.datasets import Khouja20FactualityDataset
 from llmebench.models import GPTChatCompletionModel
-from llmebench.tasks import FactualityKhouja20Task
+from llmebench.tasks import FactualityTask
 
 
 def config():
     return {
-        "dataset": FactualityKhouja20Dataset,
+        "dataset": Khouja20FactualityDataset,
         "dataset_args": {},
-        "task": FactualityKhouja20Task,
+        "task": FactualityTask,
         "task_args": {},
         "model": GPTChatCompletionModel,
         "model_args": {

diff --git a/...nt/FactualityUnifiedFC_BLOOMZ_ZeroShot.py → ...ty/UnifiedFCFactuality_BLOOMZ_ZeroShot.py b/...nt/FactualityUnifiedFC_BLOOMZ_ZeroShot.py → ...ty/UnifiedFCFactuality_BLOOMZ_ZeroShot.py
@@ -1,20 +1,15 @@
 import os
-import random
-import re
 
-from llmebench.datasets import FactualityUnifiedFCDataset
+from llmebench.datasets import UnifiedFCFactualityDataset
 from llmebench.models import BLOOMPetalModel
-from llmebench.tasks import FactualityUnifiedFCTask
-
-
-random.seed(1333)
+from llmebench.tasks import FactualityTask
 
 
 def config():
     return {
-        "dataset": FactualityUnifiedFCDataset,
+        "dataset": UnifiedFCFactualityDataset,
         "dataset_args": {},
-        "task": FactualityUnifiedFCTask,
+        "task": FactualityTask,
         "task_args": {},
         "model": BLOOMPetalModel,
         "model_args": {