diff --git a/assets/ar/QA/ARCD_Random.py b/assets/ar/QA/ARCD_Random.py new file mode 100644 index 00000000..0aa06f6d --- /dev/null +++ b/assets/ar/QA/ARCD_Random.py @@ -0,0 +1,30 @@ +import random + +from llmebench.datasets import ARCDDataset +from llmebench.models import RandomModel +from llmebench.tasks import QATask, TaskType + + +def config(): + return { + "dataset": ARCDDataset, + "dataset_args": {}, + "task": QATask, + "task_args": {}, + "model": RandomModel, + "model_args": {"task_type": TaskType.QuestionAnswering}, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + tokens = response["random_response"]["context"].split(" ") + + start_idx = random.choice(range(len(tokens))) + answer_length = random.choice(range(len(tokens) - start_idx)) + + return " ".join(tokens[start_idx : start_idx + answer_length]) diff --git a/assets/ar/QA/MLQA_Random.py b/assets/ar/QA/MLQA_Random.py new file mode 100644 index 00000000..083b7345 --- /dev/null +++ b/assets/ar/QA/MLQA_Random.py @@ -0,0 +1,30 @@ +import random + +from llmebench.datasets import MLQADataset +from llmebench.models import RandomModel +from llmebench.tasks import QATask, TaskType + + +def config(): + return { + "dataset": MLQADataset, + "dataset_args": {}, + "task": QATask, + "task_args": {}, + "model": RandomModel, + "model_args": {"task_type": TaskType.QuestionAnswering}, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + tokens = response["random_response"]["context"].split(" ") + + start_idx = random.choice(range(len(tokens))) + answer_length = random.choice(range(len(tokens) - start_idx)) + + return " ".join(tokens[start_idx : start_idx + answer_length]) diff --git a/assets/ar/QA/TyDiQA_Random.py b/assets/ar/QA/TyDiQA_Random.py new file mode 100644 index 00000000..6046216a --- /dev/null +++ b/assets/ar/QA/TyDiQA_Random.py @@ -0,0 +1,30 @@ +import random + +from llmebench.datasets import TyDiQADataset +from llmebench.models import RandomModel +from llmebench.tasks import QATask, TaskType + + +def config(): + return { + "dataset": TyDiQADataset, + "dataset_args": {}, + "task": QATask, + "task_args": {}, + "model": RandomModel, + "model_args": {"task_type": TaskType.QuestionAnswering}, + "general_args": {"test_split": "dev"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + tokens = response["random_response"]["context"].split(" ") + + start_idx = random.choice(range(len(tokens))) + answer_length = random.choice(range(len(tokens) - start_idx)) + + return " ".join(tokens[start_idx : start_idx + answer_length]) diff --git a/assets/ar/QA/XQuAD_Random.py b/assets/ar/QA/XQuAD_Random.py new file mode 100644 index 00000000..665c23d5 --- /dev/null +++ b/assets/ar/QA/XQuAD_Random.py @@ -0,0 +1,30 @@ +import random + +from llmebench.datasets import XQuADDataset +from llmebench.models import RandomModel +from llmebench.tasks import QATask, TaskType + + +def config(): + return { + "dataset": XQuADDataset, + "dataset_args": {}, + "task": QATask, + "task_args": {}, + "model": RandomModel, + "model_args": {"task_type": TaskType.QuestionAnswering}, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + tokens = response["random_response"]["context"].split(" ") + + start_idx = random.choice(range(len(tokens))) + answer_length = random.choice(range(len(tokens) - start_idx)) + + return " ".join(tokens[start_idx : start_idx + 
answer_length]) diff --git a/assets/ar/demographic_attributes/gender/ArabGend_Random.py b/assets/ar/demographic_attributes/gender/ArabGend_Random.py new file mode 100644 index 00000000..c8c5a28a --- /dev/null +++ b/assets/ar/demographic_attributes/gender/ArabGend_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import ArabGendDataset +from llmebench.models import RandomModel +from llmebench.tasks import DemographyGenderTask, TaskType + + +def config(): + return { + "dataset": ArabGendDataset, + "dataset_args": {}, + "task": DemographyGenderTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["m", "f"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/demographic_attributes/gender/ArapTweet_Random.py b/assets/ar/demographic_attributes/gender/ArapTweet_Random.py new file mode 100644 index 00000000..1a5dc352 --- /dev/null +++ b/assets/ar/demographic_attributes/gender/ArapTweet_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import ArapTweetDataset +from llmebench.models import RandomModel +from llmebench.tasks import DemographyGenderTask, TaskType + + +def config(): + return { + "dataset": ArapTweetDataset, + "dataset_args": {}, + "task": DemographyGenderTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["Female", "Male"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/demographic_attributes/location/Location_Random.py b/assets/ar/demographic_attributes/location/Location_Random.py new file mode 100644 index 00000000..86acd1bd --- /dev/null +++ b/assets/ar/demographic_attributes/location/Location_Random.py @@ -0,0 +1,49 @@ +from llmebench.datasets import LocationDataset +from llmebench.models import RandomModel +from llmebench.tasks import DemographyLocationTask, TaskType + + +def config(): + return { + "dataset": LocationDataset, + "dataset_args": {}, + "task": DemographyLocationTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": [ + "ae", + "OTHERS", + "bh", + "dz", + "eg", + "iq", + "jo", + "kw", + "lb", + "ly", + "ma", + "om", + "ps", + "qa", + "sa", + "sd", + "so", + "sy", + "tn", + "UNK", + "ye", + "mr", + ], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/demographic_attributes/name_info/NameInfo_Random.py b/assets/ar/demographic_attributes/name_info/NameInfo_Random.py new file mode 100644 index 00000000..abd6011d --- /dev/null +++ b/assets/ar/demographic_attributes/name_info/NameInfo_Random.py @@ -0,0 +1,127 @@ +from llmebench.datasets import NameInfoDataset +from llmebench.models import RandomModel +from llmebench.tasks import DemographyNameInfoTask, TaskType + + +def config(): + return { + "dataset": NameInfoDataset, + "dataset_args": {}, + "task": DemographyNameInfoTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": [ + "gb", + "us", + "cl", + "fr", + "ru", + "pl", + "in", + "it", + "kr", + "gh", + "ca", + "sa", + "at", + "de", + "cn", + "br", + "dk", + "se", + "bd", + "cu", + "jp", + "be", + "es", + "co", + 
"id", + "iq", + "pk", + "tr", + "il", + "ch", + "ar", + "ro", + "nl", + "ps", + "ug", + "ir", + "cg", + "do", + "ee", + "tn", + "gr", + "np", + "ie", + "sy", + "hu", + "eg", + "ma", + "ve", + "ph", + "no", + "bg", + "si", + "ke", + "au", + "et", + "py", + "af", + "pt", + "th", + "bo", + "mx", + "lb", + "za", + "fi", + "hr", + "vn", + "ly", + "nz", + "qa", + "kh", + "ci", + "ng", + "sg", + "cm", + "dz", + "tz", + "ae", + "pe", + "az", + "lu", + "ec", + "cz", + "ua", + "uy", + "sd", + "ao", + "my", + "lv", + "kw", + "tw", + "bh", + "lk", + "ye", + "cr", + "jo", + "pa", + "om", + "uz", + "by", + "kz", + ], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_Random.py b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_Random.py new file mode 100644 index 00000000..2ec54c45 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/adult_content_detection/Adult_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import AdultDataset +from llmebench.models import RandomModel +from llmebench.tasks import AdultTask, TaskType + + +def config(): + return { + "dataset": AdultDataset, + "dataset_args": {}, + "task": AdultTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["ADULT", "NOT_ADULT"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_Random.py b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_Random.py new file mode 100644 index 00000000..48840891 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/attentionworthy/CT22Attentionworthy_Random.py @@ -0,0 +1,36 @@ +from llmebench.datasets import CT22AttentionworthyDataset +from llmebench.models import RandomModel +from llmebench.tasks import AttentionworthyTask, TaskType + + +def config(): + return { + "dataset": CT22AttentionworthyDataset, + "dataset_args": {}, + "task": AttentionworthyTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": [ + "yes_discusses_action_taken", + "harmful", + "yes_discusses_cure", + "yes_asks_question", + "no_not_interesting", + "yes_other", + "yes_blame_authorities", + "yes_contains_advice", + "yes_calls_for_action", + ], + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py new file mode 100644 index 00000000..7c24403e --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import RandomModel +from llmebench.tasks import CheckworthinessTask, TaskType + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "dataset_args": {}, + "task": CheckworthinessTask, + "task_args": {}, + 
"model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["0", "1"], + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_ZeroShot.py index cf07aae4..bc960c5c 100644 --- a/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_GPT4_ZeroShot.py @@ -1,13 +1,13 @@ from llmebench.datasets import CT22ClaimDataset from llmebench.models import OpenAIModel -from llmebench.tasks import CheckworthinessTask +from llmebench.tasks import ClaimDetectionTask def config(): return { "dataset": CT22ClaimDataset, "dataset_args": {}, - "task": CheckworthinessTask, + "task": ClaimDetectionTask, "task_args": {}, "model": OpenAIModel, "model_args": { diff --git a/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_Random.py b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_Random.py new file mode 100644 index 00000000..b36d1705 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/claim_detection/CT22Claim_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import CT22ClaimDataset +from llmebench.models import RandomModel +from llmebench.tasks import ClaimDetectionTask, TaskType + + +def config(): + return { + "dataset": CT22ClaimDataset, + "dataset_args": {}, + "task": ClaimDetectionTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["0", "1"], + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Random.py b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Random.py new file mode 100644 index 00000000..963e47c0 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/ANSFactuality_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import ANSFactualityDataset +from llmebench.models import RandomModel +from llmebench.tasks import FactualityTask, TaskType + + +def config(): + return { + "dataset": ANSFactualityDataset, + "dataset_args": {}, + "task": FactualityTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["true", "false"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_Random.py b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_Random.py new file mode 100644 index 00000000..0d83890d --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/COVID19Factuality_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import COVID19FactualityDataset +from llmebench.models import RandomModel +from llmebench.tasks import FactualityTask, TaskType + + +def config(): + return { + "dataset": COVID19FactualityDataset, + 
"dataset_args": {}, + "task": FactualityTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["yes", "no"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_Random.py b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_Random.py new file mode 100644 index 00000000..c48262e7 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/factuality/UnifiedFCFactuality_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import UnifiedFCFactualityDataset +from llmebench.models import RandomModel +from llmebench.tasks import FactualityTask, TaskType + + +def config(): + return { + "dataset": UnifiedFCFactualityDataset, + "dataset_args": {}, + "task": FactualityTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["true", "false"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot.py index 17587c9d..35e6bc48 100644 --- a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot.py +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_GPT4_ZeroShot.py @@ -2,14 +2,14 @@ from llmebench.datasets import CT22HarmfulDataset from llmebench.models import OpenAIModel -from llmebench.tasks import CheckworthinessTask +from llmebench.tasks import HarmfulDetectionTask def config(): return { "dataset": CT22HarmfulDataset, "dataset_args": {}, - "task": CheckworthinessTask, + "task": HarmfulDetectionTask, "task_args": {}, "model": OpenAIModel, "model_args": { diff --git a/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Random.py b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Random.py new file mode 100644 index 00000000..78350822 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/harmfulness_detection/CT22Harmful_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import CT22HarmfulDataset +from llmebench.models import RandomModel +from llmebench.tasks import HarmfulDetectionTask, TaskType + + +def config(): + return { + "dataset": CT22HarmfulDataset, + "dataset_args": {}, + "task": HarmfulDetectionTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["0", "1"], + }, + "general_args": {"test_split": "ar"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_Random.py b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_Random.py new file mode 100644 index 00000000..9c312bfc --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/hate_speech/OSACT4SubtaskB_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import OSACT4SubtaskBDataset +from llmebench.models 
import RandomModel +from llmebench.tasks import HateSpeechTask, TaskType + + +def config(): + return { + "dataset": OSACT4SubtaskBDataset, + "dataset_args": {}, + "task": HateSpeechTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["HS", "NOT_HS"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_Random.py b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_Random.py new file mode 100644 index 00000000..64b4b248 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/offensive_language/OSACT4SubtaskA_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import OSACT4SubtaskADataset +from llmebench.models import RandomModel +from llmebench.tasks import OffensiveTask, TaskType + + +def config(): + return { + "dataset": OSACT4SubtaskADataset, + "dataset_args": {}, + "task": OffensiveTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["OFF", "NOT_OFF"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_Random.py b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_Random.py new file mode 100644 index 00000000..7862b52c --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/propaganda/WANLP22T3_Random.py @@ -0,0 +1,50 @@ +from llmebench.datasets import WANLP22T3PropagandaDataset +from llmebench.models import RandomModel +from llmebench.tasks import MultilabelPropagandaTask, TaskType + + +def config(): + return { + "dataset": WANLP22T3PropagandaDataset, + "dataset_args": {"techniques_path": "classes.txt"}, + "task": MultilabelPropagandaTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.MultiLabelClassification, + "class_labels": [ + "no technique", + "Smears", + "Exaggeration/Minimisation", + "Loaded Language", + "Appeal to fear/prejudice", + "Name calling/Labeling", + "Slogans", + "Repetition", + "Doubt", + "Obfuscation, Intentional vagueness, Confusion", + "Flag-waving", + "Glittering generalities (Virtue)", + "Misrepresentation of Someone's Position (Straw Man)", + "Presenting Irrelevant Data (Red Herring)", + "Appeal to authority", + "Whataboutism", + "Black-and-white Fallacy/Dictatorship", + "Thought-terminating cliché", + "Causal Oversimplification", + ], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + random_labels = response["random_response"] + if "no technique" in random_labels: + return ["no technique"] + else: + return random_labels diff --git a/assets/ar/factuality_disinformation_harmful_content/spam/Spam_Random.py b/assets/ar/factuality_disinformation_harmful_content/spam/Spam_Random.py new file mode 100644 index 00000000..fa72aa5f --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/spam/Spam_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import SpamDataset +from llmebench.models import RandomModel +from llmebench.tasks import SpamTask, TaskType + + +def config(): + return { + "dataset": SpamDataset, + "dataset_args": {}, + 
"task": SpamTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["__label__ADS", "__label__NOTADS"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_Random.py b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_Random.py new file mode 100644 index 00000000..00ca95e0 --- /dev/null +++ b/assets/ar/factuality_disinformation_harmful_content/subjectivity/CT23Subjectivity_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import CT23SubjectivityDataset +from llmebench.models import RandomModel +from llmebench.tasks import SubjectivityTask, TaskType + + +def config(): + return { + "dataset": CT23SubjectivityDataset, + "dataset_args": {}, + "task": SubjectivityTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["SUBJ", "OBJ"], + }, + "general_args": {"test_split": "ar/dev"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/news_categorization/ASND_GPT35_ZeroShot.py b/assets/ar/news_categorization/ASND_GPT35_ZeroShot.py index c87f0f8b..291d1e11 100644 --- a/assets/ar/news_categorization/ASND_GPT35_ZeroShot.py +++ b/assets/ar/news_categorization/ASND_GPT35_ZeroShot.py @@ -8,7 +8,7 @@ def config(): "dataset": ASNDDataset, "dataset_args": {}, "task": NewsCategorizationTask, - "task_args": {"test": "useless"}, + "task_args": {}, "model": LegacyOpenAIModel, "model_args": { "class_labels": [ diff --git a/assets/ar/news_categorization/ASND_Random.py b/assets/ar/news_categorization/ASND_Random.py new file mode 100644 index 00000000..caf8e500 --- /dev/null +++ b/assets/ar/news_categorization/ASND_Random.py @@ -0,0 +1,39 @@ +from llmebench.datasets import ASNDDataset +from llmebench.models import RandomModel +from llmebench.tasks import NewsCategorizationTask, TaskType + + +def config(): + return { + "dataset": ASNDDataset, + "dataset_args": {}, + "task": NewsCategorizationTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": [ + "crime-war-conflict", + "spiritual", + "health", + "politics", + "human-rights-press-freedom", + "education", + "business-and-economy", + "art-and-entertainment", + "others", + "science-and-technology", + "sports", + "environment", + ], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/news_categorization/SANADAkhbarona_Random.py b/assets/ar/news_categorization/SANADAkhbarona_Random.py new file mode 100644 index 00000000..5db05131 --- /dev/null +++ b/assets/ar/news_categorization/SANADAkhbarona_Random.py @@ -0,0 +1,34 @@ +from llmebench.datasets import SANADAkhbaronaDataset +from llmebench.models import RandomModel +from llmebench.tasks import NewsCategorizationTask, TaskType + + +def config(): + return { + "dataset": SANADAkhbaronaDataset, + "dataset_args": {}, + "task": NewsCategorizationTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": [ + "politics", + "religion", + "medical", + "sports", + "tech", + "finance", + "culture", + ], + }, + 
"general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/news_categorization/SANADAlArabiya_Random.py b/assets/ar/news_categorization/SANADAlArabiya_Random.py new file mode 100644 index 00000000..cc55e7b5 --- /dev/null +++ b/assets/ar/news_categorization/SANADAlArabiya_Random.py @@ -0,0 +1,33 @@ +from llmebench.datasets import SANADAlArabiyaDataset +from llmebench.models import RandomModel +from llmebench.tasks import NewsCategorizationTask, TaskType + + +def config(): + return { + "dataset": SANADAlArabiyaDataset, + "dataset_args": {}, + "task": NewsCategorizationTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": [ + "politics", + "medical", + "sports", + "tech", + "finance", + "culture", + ], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/news_categorization/SANADAlKhaleej_Random.py b/assets/ar/news_categorization/SANADAlKhaleej_Random.py new file mode 100644 index 00000000..76bd7793 --- /dev/null +++ b/assets/ar/news_categorization/SANADAlKhaleej_Random.py @@ -0,0 +1,34 @@ +from llmebench.datasets import SANADAlKhaleejDataset +from llmebench.models import RandomModel +from llmebench.tasks import NewsCategorizationTask, TaskType + + +def config(): + return { + "dataset": SANADAlKhaleejDataset, + "dataset_args": {}, + "task": NewsCategorizationTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": [ + "politics", + "religion", + "medical", + "sports", + "tech", + "finance", + "culture", + ], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/semantics/NLI/XNLI_Random.py b/assets/ar/semantics/NLI/XNLI_Random.py new file mode 100644 index 00000000..cacd6b9e --- /dev/null +++ b/assets/ar/semantics/NLI/XNLI_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import XNLIDataset +from llmebench.models import RandomModel +from llmebench.tasks import TaskType, XNLITask + + +def config(): + return { + "dataset": XNLIDataset, + "dataset_args": {}, + "task": XNLITask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["contradiction", "entailment", "neutral"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/semantics/STS/Q2QSim_Random.py b/assets/ar/semantics/STS/Q2QSim_Random.py new file mode 100644 index 00000000..a1a1e0b6 --- /dev/null +++ b/assets/ar/semantics/STS/Q2QSim_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import STSQ2QDataset +from llmebench.models import RandomModel +from llmebench.tasks import Q2QSimDetectionTask, TaskType + + +def config(): + return { + "dataset": STSQ2QDataset, + "dataset_args": {}, + "task": Q2QSimDetectionTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["0", "1"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/semantics/STS/SemEval17T1STS_Random.py 
b/assets/ar/semantics/STS/SemEval17T1STS_Random.py
new file mode 100644
index 00000000..6798f690
--- /dev/null
+++ b/assets/ar/semantics/STS/SemEval17T1STS_Random.py
@@ -0,0 +1,23 @@
+from llmebench.datasets import SemEval17T1STSDataset
+from llmebench.models import RandomModel
+from llmebench.tasks import STSTask, TaskType
+
+
+def config():
+    return {
+        "dataset": SemEval17T1STSDataset,
+        "dataset_args": {},
+        "task": STSTask,
+        "task_args": {},
+        "model": RandomModel,
+        "model_args": {"task_type": TaskType.Regression, "score_range": (0, 5)},
+        "general_args": {},
+    }
+
+
+def prompt(input_sample):
+    return input_sample
+
+
+def post_process(response):
+    return response["random_response"]
diff --git a/assets/ar/semantics/STS/SemEval17T2STS_Random.py b/assets/ar/semantics/STS/SemEval17T2STS_Random.py
new file mode 100644
index 00000000..35d811ee
--- /dev/null
+++ b/assets/ar/semantics/STS/SemEval17T2STS_Random.py
@@ -0,0 +1,23 @@
+from llmebench.datasets import SemEval17T2STSDataset
+from llmebench.models import RandomModel
+from llmebench.tasks import STSTask, TaskType
+
+
+def config():
+    return {
+        "dataset": SemEval17T2STSDataset,
+        "dataset_args": {},
+        "task": STSTask,
+        "task_args": {},
+        "model": RandomModel,
+        "model_args": {"task_type": TaskType.Regression, "score_range": (0, 5)},
+        "general_args": {},
+    }
+
+
+def prompt(input_sample):
+    return input_sample
+
+
+def post_process(response):
+    return response["random_response"]
diff --git a/assets/ar/sentiment_emotion_others/emotion/Emotion_Random.py b/assets/ar/sentiment_emotion_others/emotion/Emotion_Random.py
new file mode 100644
index 00000000..5aab8975
--- /dev/null
+++ b/assets/ar/sentiment_emotion_others/emotion/Emotion_Random.py
@@ -0,0 +1,57 @@
+from llmebench.datasets import EmotionDataset
+from llmebench.models import RandomModel
+from llmebench.tasks import EmotionTask, TaskType
+
+
+def config():
+    return {
+        "dataset": EmotionDataset,
+        "dataset_args": {},
+        "task": EmotionTask,
+        "task_args": {},
+        "model": RandomModel,
+        "model_args": {
+            "task_type": TaskType.MultiLabelClassification,
+            "class_labels": [
+                "anger",
+                "anticipation",
+                "disgust",
+                "fear",
+                "joy",
+                "love",
+                "optimism",
+                "pessimism",
+                "sadness",
+                "surprise",
+                "trust",
+            ],
+        },
+        "general_args": {},
+    }
+
+
+def prompt(input_sample):
+    return input_sample
+
+
+def post_process(response):
+    emotions_positions = {
+        "anger": 0,
+        "anticipation": 1,
+        "disgust": 2,
+        "fear": 3,
+        "joy": 4,
+        "love": 5,
+        "optimism": 6,
+        "pessimism": 7,
+        "sadness": 8,
+        "surprise": 9,
+        "trust": 10,
+    }
+
+    results = [0] * 11
+
+    for emotion in emotions_positions:
+        if emotion in response["random_response"]:
+            results[emotions_positions[emotion]] = 1
+    return results
diff --git a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_Random.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_Random.py
new file mode 100644
index 00000000..dbfc6a88
--- /dev/null
+++ b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm2_Random.py
@@ -0,0 +1,26 @@
+from llmebench.datasets import ArSarcasm2Dataset
+from llmebench.models import RandomModel
+from llmebench.tasks import SarcasmTask, TaskType
+
+
+def config():
+    return {
+        "dataset": ArSarcasm2Dataset,
+        "dataset_args": {},
+        "task": SarcasmTask,
+        "task_args": {},
+        "model": RandomModel,
+        "model_args": {
+            "task_type": TaskType.Classification,
+            "class_labels": ["TRUE", "FALSE"],
+        },
+        "general_args": {},
+    }
+
+
+def prompt(input_sample):
+    return input_sample
+
+
+def post_process(response):
+    return
response["random_response"] diff --git a/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_Random.py b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_Random.py new file mode 100644 index 00000000..5574d4c6 --- /dev/null +++ b/assets/ar/sentiment_emotion_others/sarcasm/ArSarcasm_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import ArSarcasmDataset +from llmebench.models import RandomModel +from llmebench.tasks import SarcasmTask, TaskType + + +def config(): + return { + "dataset": ArSarcasmDataset, + "dataset_args": {}, + "task": SarcasmTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["TRUE", "FALSE"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/sentiment_emotion_others/sentiment/ArSAS_Random.py b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_Random.py new file mode 100644 index 00000000..2dbeb57e --- /dev/null +++ b/assets/ar/sentiment_emotion_others/sentiment/ArSAS_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import ArSASDataset +from llmebench.models import RandomModel +from llmebench.tasks import SentimentTask, TaskType + + +def config(): + return { + "dataset": ArSASDataset, + "dataset_args": {}, + "task": SentimentTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["Positive", "Negative", "Neutral", "Mixed"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_Random.py b/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_Random.py new file mode 100644 index 00000000..78ec0999 --- /dev/null +++ b/assets/ar/sentiment_emotion_others/stance_detection/ANSStance_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import ANSStanceDataset +from llmebench.models import RandomModel +from llmebench.tasks import StanceTask, TaskType + + +def config(): + return { + "dataset": ANSStanceDataset, + "dataset_args": {}, + "task": StanceTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["agree", "disagree"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_Random.py b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_Random.py new file mode 100644 index 00000000..c45214ba --- /dev/null +++ b/assets/ar/sentiment_emotion_others/stance_detection/UnifiedFCStance_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import UnifiedFCStanceDataset +from llmebench.models import RandomModel +from llmebench.tasks import StanceTask, TaskType + + +def config(): + return { + "dataset": UnifiedFCStanceDataset, + "dataset_args": {}, + "task": StanceTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["agree", "disagree", "discuss", "unrelated"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_Random.py 
b/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_Random.py new file mode 100644 index 00000000..07b7bf60 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/NER/ANERcorp_Random.py @@ -0,0 +1,36 @@ +from llmebench.datasets import ANERcorpDataset +from llmebench.models import RandomModel +from llmebench.tasks import NERTask, TaskType + + +def config(): + return { + "dataset": ANERcorpDataset, + "dataset_args": {}, + "task": NERTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.SequenceLabeling, + "class_labels": [ + "B-PERS", + "I-PERS", + "B-LOC", + "I-LOC", + "B-ORG", + "I-ORG", + "B-MISC", + "I-MISC", + "O", + ], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_Random.py b/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_Random.py new file mode 100644 index 00000000..53474532 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/NER/Aqmar_Random.py @@ -0,0 +1,36 @@ +from llmebench.datasets import AqmarDataset +from llmebench.models import RandomModel +from llmebench.tasks import NERTask, TaskType + + +def config(): + return { + "dataset": AqmarDataset, + "dataset_args": {}, + "task": NERTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.SequenceLabeling, + "class_labels": [ + "B-PERS", + "I-PERS", + "B-LOC", + "I-LOC", + "B-ORG", + "I-ORG", + "B-MISC", + "I-MISC", + "O", + ], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/sequence_tagging_and_information_extraction/NER/MGBWords_Random.py b/assets/ar/sequence_tagging_and_information_extraction/NER/MGBWords_Random.py new file mode 100644 index 00000000..bdbab1e6 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/NER/MGBWords_Random.py @@ -0,0 +1,36 @@ +from llmebench.datasets import MGBWordsDataset +from llmebench.models import RandomModel +from llmebench.tasks import NERTask, TaskType + + +def config(): + return { + "dataset": MGBWordsDataset, + "dataset_args": {}, + "task": NERTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.SequenceLabeling, + "class_labels": [ + "B-PERS", + "I-PERS", + "B-LOC", + "I-LOC", + "B-ORG", + "I-ORG", + "B-MISC", + "I-MISC", + "O", + ], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabicPOS_GPT4_ZeroShot.py b/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT4_ZeroShot.py similarity index 100% rename from assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabicPOS_GPT4_ZeroShot.py rename to assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_GPT4_ZeroShot.py diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_Random.py b/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_Random.py new file mode 100644 index 00000000..e063e872 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/QCRIDialectalArabic_Random.py @@ -0,0 +1,48 @@ +from 
llmebench.datasets import QCRIDialectalArabicPOSDataset +from llmebench.models import RandomModel +from llmebench.tasks import ArabicPOSTask, TaskType + + +def config(): + return { + "dataset": QCRIDialectalArabicPOSDataset, + "dataset_args": {}, + "task": ArabicPOSTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.SequenceLabeling, + "class_labels": [ + "ADJ", + "ADV", + "CASE", + "CONJ", + "DET", + "EMOT", + "FOREIGN", + "FUT_PART", + "HASH", + "MENTION", + "NEG_PART", + "NOUN", + "NSUFF", + "NUM", + "PART", + "PREP", + "PROG_PART", + "PRON", + "PUNC", + "URL", + "V", + ], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_Random.py b/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_Random.py new file mode 100644 index 00000000..17623531 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/WikiNews_Random.py @@ -0,0 +1,54 @@ +from llmebench.datasets import WikiNewsPOSDataset +from llmebench.models import RandomModel +from llmebench.tasks import ArabicPOSTask, TaskType + + +def config(): + return { + "dataset": WikiNewsPOSDataset, + "dataset_args": {}, + "task": ArabicPOSTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.SequenceLabeling, + "class_labels": [ + "ABBREV", + "ADJ", + "ADJ/CONJ", + "ADJ/DET", + "ADJ/NUM", + "ADV", + "CASE", + "CONJ", + "DET", + "FOREIGN", + "FUT_PART", + "NOUN", + "NOUN/DET", + "NSUFF", + "NSUFF/ADJ", + "NSUFF/DET", + "NSUFF/NOUN", + "NUM", + "PART", + "PART/CONJ", + "PART/NOUN", + "PART/PART", + "PART/PREP", + "PREP", + "PRON", + "PUNC", + "V", + ], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_Random.py b/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_Random.py new file mode 100644 index 00000000..6a559889 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/POS/XGLUE_Random.py @@ -0,0 +1,43 @@ +from llmebench.datasets import XGLUEPOSDataset +from llmebench.models import RandomModel +from llmebench.tasks import ArabicPOSTask, TaskType + + +def config(): + return { + "dataset": XGLUEPOSDataset, + "dataset_args": {}, + "task": ArabicPOSTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.SequenceLabeling, + "class_labels": [ + "ADJ", + "ADP", + "ADV", + "AUX", + "CCONJ", + "DET", + "INTJ", + "NOUN", + "NUM", + "PART", + "PRON", + "PROPN", + "PUNCT", + "SYM", + "VERB", + "X", + ], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_Random.py b/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_Random.py new file mode 100644 index 00000000..bb78221a --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/diacritization/BibleMaghrebi_Random.py @@ -0,0 +1,64 @@ +import random + +from llmebench.datasets import BibleMaghrebiDiacritizationDataset +from llmebench.models import RandomModel +from llmebench.tasks import ArabicDiacritizationTask, TaskType + + +def 
config(): + return { + "dataset": BibleMaghrebiDiacritizationDataset, + "dataset_args": {}, + "task": ArabicDiacritizationTask, + "task_args": {}, + "model": RandomModel, + "model_args": {"task_type": TaskType.Other}, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + diacritics = [ + "\u064E", # Fatha + "\u064B", # Fathatan + "\u064F", # Damma + "\u064C", # Dammatan + "\u0650", # Kasra + "\u064D", # Kasratan + "\u0652", # Sukun + "\u0651", # Shadda + None, + ] + + shadda_diacritics = [ + "\u064E", # Fatha + "\u064B", # Fathatan + "\u064F", # Damma + "\u064C", # Dammatan + "\u0650", # Kasra + "\u064D", # Kasratan + None, + ] + + diacritized_sentence = [] + for token in response["random_response"].split(" "): + new_token = [] + for character in token: + new_token.append(character) + + diacritic = random.choice(diacritics) + if diacritic is None: + continue + + if diacritic is diacritics[-2]: # Shadda + extra_diacritic = random.choice(shadda_diacritics) + if extra_diacritic: + new_token.append(extra_diacritic) + new_token.append(diacritic) + diacritized_sentence.append("".join(new_token)) + + return " ".join(diacritized_sentence) diff --git a/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_Random.py b/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_Random.py new file mode 100644 index 00000000..9acf2f92 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/diacritization/WikiNews_Random.py @@ -0,0 +1,64 @@ +import random + +from llmebench.datasets import WikiNewsDiacritizationDataset +from llmebench.models import RandomModel +from llmebench.tasks import ArabicDiacritizationTask, TaskType + + +def config(): + return { + "dataset": WikiNewsDiacritizationDataset, + "dataset_args": {}, + "task": ArabicDiacritizationTask, + "task_args": {}, + "model": RandomModel, + "model_args": {"task_type": TaskType.Other}, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + diacritics = [ + "\u064E", # Fatha + "\u064B", # Fathatan + "\u064F", # Damma + "\u064C", # Dammatan + "\u0650", # Kasra + "\u064D", # Kasratan + "\u0652", # Sukun + "\u0651", # Shadda + None, + ] + + shadda_diacritics = [ + "\u064E", # Fatha + "\u064B", # Fathatan + "\u064F", # Damma + "\u064C", # Dammatan + "\u0650", # Kasra + "\u064D", # Kasratan + None, + ] + + diacritized_sentence = [] + for token in response["random_response"].split(" "): + new_token = [] + for character in token: + new_token.append(character) + + diacritic = random.choice(diacritics) + if diacritic is None: + continue + + if diacritic is diacritics[-2]: # Shadda + extra_diacritic = random.choice(shadda_diacritics) + if extra_diacritic: + new_token.append(extra_diacritic) + new_token.append(diacritic) + diacritized_sentence.append("".join(new_token)) + + return " ".join(diacritized_sentence) diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_Random.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_Random.py new file mode 100644 index 00000000..e4c5e249 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/ADI_Random.py @@ -0,0 +1,42 @@ +from llmebench.datasets import ADIDataset +from llmebench.models import RandomModel +from llmebench.tasks import DialectIDTask, TaskType + + +def config(): + return { + "dataset": ADIDataset, + "dataset_args": {}, 
+ "task": DialectIDTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": [ + "egy", + "ira", + "jor", + "ksa", + "kuw", + "leb", + "lib", + "mor", + "msa", + "pal", + "qat", + "sud", + "syr", + "uae", + "YEM", + ], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_Random.py b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_Random.py new file mode 100644 index 00000000..20e6d85f --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/dialect_identification/QADI_Random.py @@ -0,0 +1,46 @@ +from llmebench.datasets import QADIDataset +from llmebench.models import RandomModel +from llmebench.tasks import DialectIDTask, TaskType + + +def config(): + return { + "dataset": QADIDataset, + "dataset_args": {}, + "task": DialectIDTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": [ + "EG", + "DZ", + "SD", + "YE", + "SY", + "TN", + "AE", + "JO", + "LY", + "PS", + "OM", + "LB", + "KW", + "QA", + "BH", + "MSA", + "SA", + "IQ", + "MA", + ], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_Random.py b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_Random.py new file mode 100644 index 00000000..4c7e923c --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/lemmatization/WikiNews_Random.py @@ -0,0 +1,39 @@ +import random + +from llmebench.datasets import WikiNewsLemmatizationDataset +from llmebench.models import RandomModel +from llmebench.tasks import LemmatizationTask, TaskType + + +def config(): + return { + "dataset": WikiNewsLemmatizationDataset, + "dataset_args": {}, + "task": LemmatizationTask, + "task_args": {}, + "model": RandomModel, + "model_args": {"task_type": TaskType.Other}, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + lemmatized_sentence = [] + + for token in response["random_response"].split(" "): + if len(token) > 1: + # Keep atleast 1 character in lemma + nonlemma_length = random.choice(range(len(token) - 1)) + + prefix_length = random.choice(range(nonlemma_length + 1)) + suffix_length = nonlemma_length - prefix_length + + token = token[prefix_length : len(token) - suffix_length] + + lemmatized_sentence.append(token) + + return (None, " ".join(lemmatized_sentence)) diff --git a/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_Random.py b/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_Random.py new file mode 100644 index 00000000..b4ca6cd4 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/parsing/PADT_Random.py @@ -0,0 +1,31 @@ +import random + +from llmebench.datasets import PADTDataset +from llmebench.models import RandomModel +from llmebench.tasks import ArabicParsingTask, TaskType + + +def config(): + return { + "dataset": PADTDataset, + "dataset_args": {}, + "task": ArabicParsingTask, + "task_args": {}, + "model": RandomModel, + "model_args": {"task_type": TaskType.Other}, + "general_args": {}, + } + + +def prompt(input_sample): 
+ return input_sample + + +def post_process(response): + tokens = response["random_response"].split(" ") + random_labels = [str(idx) for idx in range(len(tokens) + 1)] + random_response = { + str(idx + 1): random.choice(random_labels) for idx in range(len(tokens)) + } + + return random_response diff --git a/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_Random.py b/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_Random.py new file mode 100644 index 00000000..5b815f1e --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/segmentation/QCRIDialectalArabic_Random.py @@ -0,0 +1,39 @@ +import random + +from llmebench.datasets import QCRIDialectalArabicSegmentationDataset +from llmebench.models import RandomModel +from llmebench.tasks import ArabicSegmentationTask, TaskType + + +def config(): + return { + "dataset": QCRIDialectalArabicSegmentationDataset, + "dataset_args": {}, + "task": ArabicSegmentationTask, + "task_args": {}, + "model": RandomModel, + "model_args": {"task_type": TaskType.Other}, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + segmented_sentence = [] + for token in response["random_response"].split(" "): + if len(token) > 1: + num_segments = min(len(token) - 1, random.choice([0, 1, 2])) + for segment_idx, segment_loc in enumerate( + sorted(random.sample(range(len(token) - 1), k=num_segments)) + ): + token = ( + token[: segment_idx + segment_loc + 1] + + "+" + + token[segment_idx + segment_loc + 1 :] + ) + segmented_sentence.append(token) + + return " ".join(segmented_sentence) diff --git a/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_Random.py b/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_Random.py new file mode 100644 index 00000000..bdc51e62 --- /dev/null +++ b/assets/ar/sequence_tagging_and_information_extraction/segmentation/WikiNews_Random.py @@ -0,0 +1,39 @@ +import random + +from llmebench.datasets import WikiNewsSegmentationDataset +from llmebench.models import RandomModel +from llmebench.tasks import ArabicSegmentationTask, TaskType + + +def config(): + return { + "dataset": WikiNewsSegmentationDataset, + "dataset_args": {}, + "task": ArabicSegmentationTask, + "task_args": {}, + "model": RandomModel, + "model_args": {"task_type": TaskType.Other}, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + segmented_sentence = [] + for token in response["random_response"].split(" "): + if len(token) > 1: + num_segments = min(len(token) - 1, random.choice([0, 1, 2])) + for segment_idx, segment_loc in enumerate( + sorted(random.sample(range(len(token) - 1), k=num_segments)) + ): + token = ( + token[: segment_idx + segment_loc + 1] + + "+" + + token[segment_idx + segment_loc + 1 :] + ) + segmented_sentence.append(token) + + return " ".join(segmented_sentence) diff --git a/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py b/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py new file mode 100644 index 00000000..1e816c30 --- /dev/null +++ b/assets/bg/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import RandomModel +from llmebench.tasks import CheckworthinessTask, 
TaskType + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "dataset_args": {}, + "task": CheckworthinessTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["0", "1"], + }, + "general_args": {"test_split": "bg"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_Random.py b/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_Random.py new file mode 100644 index 00000000..15bb1461 --- /dev/null +++ b/assets/bn/sentiment_emotion_others/sentiment/BanglaSentiment_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import BanglaSentimentDataset +from llmebench.models import RandomModel +from llmebench.tasks import SentimentTask, TaskType + + +def config(): + return { + "dataset": BanglaSentimentDataset, + "dataset_args": {}, + "task": SentimentTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["Positive", "Negative", "Neutral"], + }, + "general_args": {}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py b/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py new file mode 100644 index 00000000..4b46ce59 --- /dev/null +++ b/assets/de/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py @@ -0,0 +1,55 @@ +from llmebench.datasets import SemEval23T3PropagandaDataset +from llmebench.models import RandomModel +from llmebench.tasks import MultilabelPropagandaTask, TaskType + + +def config(): + return { + "dataset": SemEval23T3PropagandaDataset, + "dataset_args": {"techniques_path": "techniques_subtask3.txt"}, + "task": MultilabelPropagandaTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.MultiLabelClassification, + "class_labels": [ + "Appeal_to_Authority", + "Appeal_to_Fear-Prejudice", + "Appeal_to_Hypocrisy", + "Appeal_to_Popularity", + "Appeal_to_Time", + "Appeal_to_Values", + "Causal_Oversimplification", + "Consequential_Oversimplification", + "Conversation_Killer", + "Doubt", + "Exaggeration-Minimisation", + "False_Dilemma-No_Choice", + "Flag_Waving", + "Guilt_by_Association", + "Loaded_Language", + "Name_Calling-Labeling", + "Obfuscation-Vagueness-Confusion", + "Questioning_the_Reputation", + "Red_Herring", + "Repetition", + "Slogans", + "Straw_Man", + "Whataboutism", + "no_technique", + ], + }, + "general_args": {"test_split": "de/dev"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + random_labels = response["random_response"] + if "no_technique" in random_labels: + return ["no_technique"] + else: + return random_labels diff --git a/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py b/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py new file mode 100644 index 00000000..f8ea8238 --- /dev/null +++ b/assets/en/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import RandomModel +from llmebench.tasks import 
CheckworthinessTask, TaskType + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "dataset_args": {}, + "task": CheckworthinessTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["0", "1"], + }, + "general_args": {"test_split": "en"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py b/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py new file mode 100644 index 00000000..1e1e6413 --- /dev/null +++ b/assets/en/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py @@ -0,0 +1,55 @@ +from llmebench.datasets import SemEval23T3PropagandaDataset +from llmebench.models import RandomModel +from llmebench.tasks import MultilabelPropagandaTask, TaskType + + +def config(): + return { + "dataset": SemEval23T3PropagandaDataset, + "dataset_args": {"techniques_path": "techniques_subtask3.txt"}, + "task": MultilabelPropagandaTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.MultiLabelClassification, + "class_labels": [ + "Appeal_to_Authority", + "Appeal_to_Fear-Prejudice", + "Appeal_to_Hypocrisy", + "Appeal_to_Popularity", + "Appeal_to_Time", + "Appeal_to_Values", + "Causal_Oversimplification", + "Consequential_Oversimplification", + "Conversation_Killer", + "Doubt", + "Exaggeration-Minimisation", + "False_Dilemma-No_Choice", + "Flag_Waving", + "Guilt_by_Association", + "Loaded_Language", + "Name_Calling-Labeling", + "Obfuscation-Vagueness-Confusion", + "Questioning_the_Reputation", + "Red_Herring", + "Repetition", + "Slogans", + "Straw_Man", + "Whataboutism", + "no_technique", + ], + }, + "general_args": {"test_split": "en/dev"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + random_labels = response["random_response"] + if "no_technique" in random_labels: + return ["no_technique"] + else: + return random_labels diff --git a/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py b/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py new file mode 100644 index 00000000..77ff6913 --- /dev/null +++ b/assets/es/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import RandomModel +from llmebench.tasks import CheckworthinessTask, TaskType + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "dataset_args": {}, + "task": CheckworthinessTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["0", "1"], + }, + "general_args": {"test_split": "es"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py b/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py new file mode 100644 index 00000000..555d6611 --- /dev/null +++ b/assets/fr/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py @@ -0,0 +1,55 @@ +from llmebench.datasets 
import SemEval23T3PropagandaDataset +from llmebench.models import RandomModel +from llmebench.tasks import MultilabelPropagandaTask, TaskType + + +def config(): + return { + "dataset": SemEval23T3PropagandaDataset, + "dataset_args": {"techniques_path": "techniques_subtask3.txt"}, + "task": MultilabelPropagandaTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.MultiLabelClassification, + "class_labels": [ + "Appeal_to_Authority", + "Appeal_to_Fear-Prejudice", + "Appeal_to_Hypocrisy", + "Appeal_to_Popularity", + "Appeal_to_Time", + "Appeal_to_Values", + "Causal_Oversimplification", + "Consequential_Oversimplification", + "Conversation_Killer", + "Doubt", + "Exaggeration-Minimisation", + "False_Dilemma-No_Choice", + "Flag_Waving", + "Guilt_by_Association", + "Loaded_Language", + "Name_Calling-Labeling", + "Obfuscation-Vagueness-Confusion", + "Questioning_the_Reputation", + "Red_Herring", + "Repetition", + "Slogans", + "Straw_Man", + "Whataboutism", + "no_technique", + ], + }, + "general_args": {"test_split": "fr/dev"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + random_labels = response["random_response"] + if "no_technique" in random_labels: + return ["no_technique"] + else: + return random_labels diff --git a/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py b/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py new file mode 100644 index 00000000..7c73d079 --- /dev/null +++ b/assets/it/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py @@ -0,0 +1,55 @@ +from llmebench.datasets import SemEval23T3PropagandaDataset +from llmebench.models import RandomModel +from llmebench.tasks import MultilabelPropagandaTask, TaskType + + +def config(): + return { + "dataset": SemEval23T3PropagandaDataset, + "dataset_args": {"techniques_path": "techniques_subtask3.txt"}, + "task": MultilabelPropagandaTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.MultiLabelClassification, + "class_labels": [ + "Appeal_to_Authority", + "Appeal_to_Fear-Prejudice", + "Appeal_to_Hypocrisy", + "Appeal_to_Popularity", + "Appeal_to_Time", + "Appeal_to_Values", + "Causal_Oversimplification", + "Consequential_Oversimplification", + "Conversation_Killer", + "Doubt", + "Exaggeration-Minimisation", + "False_Dilemma-No_Choice", + "Flag_Waving", + "Guilt_by_Association", + "Loaded_Language", + "Name_Calling-Labeling", + "Obfuscation-Vagueness-Confusion", + "Questioning_the_Reputation", + "Red_Herring", + "Repetition", + "Slogans", + "Straw_Man", + "Whataboutism", + "no_technique", + ], + }, + "general_args": {"test_split": "it/dev"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + random_labels = response["random_response"] + if "no_technique" in random_labels: + return ["no_technique"] + else: + return random_labels diff --git a/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py b/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py new file mode 100644 index 00000000..dd15325d --- /dev/null +++ b/assets/nl/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import RandomModel +from llmebench.tasks import 
CheckworthinessTask, TaskType + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "dataset_args": {}, + "task": CheckworthinessTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["0", "1"], + }, + "general_args": {"test_split": "nl"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py b/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py new file mode 100644 index 00000000..fb80159c --- /dev/null +++ b/assets/pl/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py @@ -0,0 +1,55 @@ +from llmebench.datasets import SemEval23T3PropagandaDataset +from llmebench.models import RandomModel +from llmebench.tasks import MultilabelPropagandaTask, TaskType + + +def config(): + return { + "dataset": SemEval23T3PropagandaDataset, + "dataset_args": {"techniques_path": "techniques_subtask3.txt"}, + "task": MultilabelPropagandaTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.MultiLabelClassification, + "class_labels": [ + "Appeal_to_Authority", + "Appeal_to_Fear-Prejudice", + "Appeal_to_Hypocrisy", + "Appeal_to_Popularity", + "Appeal_to_Time", + "Appeal_to_Values", + "Causal_Oversimplification", + "Consequential_Oversimplification", + "Conversation_Killer", + "Doubt", + "Exaggeration-Minimisation", + "False_Dilemma-No_Choice", + "Flag_Waving", + "Guilt_by_Association", + "Loaded_Language", + "Name_Calling-Labeling", + "Obfuscation-Vagueness-Confusion", + "Questioning_the_Reputation", + "Red_Herring", + "Repetition", + "Slogans", + "Straw_Man", + "Whataboutism", + "no_technique", + ], + }, + "general_args": {"test_split": "pl/dev"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + random_labels = response["random_response"] + if "no_technique" in random_labels: + return ["no_technique"] + else: + return random_labels diff --git a/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py b/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py new file mode 100644 index 00000000..274cdf29 --- /dev/null +++ b/assets/ru/factuality_disinformation_harmful_content/propaganda/SemEval23T3Propaganda_Random.py @@ -0,0 +1,55 @@ +from llmebench.datasets import SemEval23T3PropagandaDataset +from llmebench.models import RandomModel +from llmebench.tasks import MultilabelPropagandaTask, TaskType + + +def config(): + return { + "dataset": SemEval23T3PropagandaDataset, + "dataset_args": {"techniques_path": "techniques_subtask3.txt"}, + "task": MultilabelPropagandaTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.MultiLabelClassification, + "class_labels": [ + "Appeal_to_Authority", + "Appeal_to_Fear-Prejudice", + "Appeal_to_Hypocrisy", + "Appeal_to_Popularity", + "Appeal_to_Time", + "Appeal_to_Values", + "Causal_Oversimplification", + "Consequential_Oversimplification", + "Conversation_Killer", + "Doubt", + "Exaggeration-Minimisation", + "False_Dilemma-No_Choice", + "Flag_Waving", + "Guilt_by_Association", + "Loaded_Language", + "Name_Calling-Labeling", + "Obfuscation-Vagueness-Confusion", + "Questioning_the_Reputation", + "Red_Herring", + "Repetition", + "Slogans", + 
"Straw_Man", + "Whataboutism", + "no_technique", + ], + }, + "general_args": {"test_split": "ru/dev"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + random_labels = response["random_response"] + if "no_technique" in random_labels: + return ["no_technique"] + else: + return random_labels diff --git a/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py b/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py new file mode 100644 index 00000000..2094a028 --- /dev/null +++ b/assets/tr/factuality_disinformation_harmful_content/checkworthyness/CT22Checkworthiness_Random.py @@ -0,0 +1,26 @@ +from llmebench.datasets import CT22CheckworthinessDataset +from llmebench.models import RandomModel +from llmebench.tasks import CheckworthinessTask, TaskType + + +def config(): + return { + "dataset": CT22CheckworthinessDataset, + "dataset_args": {}, + "task": CheckworthinessTask, + "task_args": {}, + "model": RandomModel, + "model_args": { + "task_type": TaskType.Classification, + "class_labels": ["0", "1"], + }, + "general_args": {"test_split": "tr"}, + } + + +def prompt(input_sample): + return input_sample + + +def post_process(response): + return response["random_response"] diff --git a/llmebench/datasets/ADI.py b/llmebench/datasets/ADI.py index 18f86ccc..9e98b0df 100644 --- a/llmebench/datasets/ADI.py +++ b/llmebench/datasets/ADI.py @@ -23,20 +23,20 @@ def metadata(): }, "task_type": TaskType.Classification, "class_labels": [ - "EGY", - "IRA", - "JOR", - "KSA", - "KUW", - "LEB", - "LIB", - "MOR", - "MSA", - "PAL", - "QAT", - "SUD", - "SYR", - "UAE", + "egy", + "ira", + "jor", + "ksa", + "kuw", + "leb", + "lib", + "mor", + "msa", + "pal", + "qat", + "sud", + "syr", + "uae", "YEM", ], } diff --git a/llmebench/models/Random.py b/llmebench/models/Random.py new file mode 100644 index 00000000..75c2f25a --- /dev/null +++ b/llmebench/models/Random.py @@ -0,0 +1,75 @@ +import random + +from llmebench.models.model_base import ModelBase + +from llmebench.tasks import TaskType + + +class RandomModel(ModelBase): + def __init__(self, task_type, **kwargs): + self.task_type = task_type + if self.task_type == TaskType.Classification: + self.class_labels = kwargs["class_labels"] + elif self.task_type == TaskType.SequenceLabeling: + self.class_labels = kwargs["class_labels"] + elif self.task_type == TaskType.MultiLabelClassification: + self.class_labels = kwargs["class_labels"] + elif self.task_type == TaskType.Regression: + self.score_range = kwargs["score_range"] + else: + self.task_type = TaskType.Other + + random.seed(2023) + + super(RandomModel, self).__init__(**kwargs) + + def summarize_response(self, response): + if "random_response" in response: + return response["random_response"] + + return None + + def prompt(self, processed_input): + if self.task_type == TaskType.Classification: + random_response = random.choice(self.class_labels) + elif self.task_type == TaskType.SequenceLabeling: + assert isinstance( + processed_input, str + ), "RandomModel only works with string `input` for labeling tasks" + random_response = " ".join( + [random.choice(self.class_labels) for _ in processed_input.split()] + ) + elif self.task_type == TaskType.MultiLabelClassification: + random_response = [ + label for label in self.class_labels if random.random() > 0.5 + ] + elif self.task_type == TaskType.Regression: + min_val, max_val = self.score_range + random_response = min_val + random.random() * 
(max_val - min_val) + else: + random_response = processed_input + + # elif self.task_type == "labeling-index": + # assert isinstance( + # processed_input, str + # ), "RandomModel only works with string `input` for labeling tasks" + # tokens = processed_input.split() + # random_labels = [str(idx) for idx in range(len(tokens) + 1)] + # random_response = { + # str(idx + 1): random.choice(random_labels) for idx in range(len(tokens)) + # } + # elif self.task_type == "regression": + # min_val, max_val = self.score_range + # random_response = min_val + random.random() * (max_val - min_val) + # elif self.task_type == "multilabel-list": + + # elif self.task_type == "multilabel-length": + # random_response = [ + # random.choice(self.class_labels) for _ in range(self.num_labels) + # ] + # elif self.task_type == "identity": + # random_response = processed_input + # else: + # raise ValueError(f"Unsupported task_type {self.task_type} in RandomModel") + + return {"random_response": random_response} diff --git a/llmebench/models/__init__.py b/llmebench/models/__init__.py index 8afcd76d..facffec8 100644 --- a/llmebench/models/__init__.py +++ b/llmebench/models/__init__.py @@ -1,3 +1,4 @@ from .HuggingFaceInferenceAPI import HuggingFaceInferenceAPIModel, HuggingFaceTaskTypes from .OpenAI import LegacyOpenAIModel, OpenAIModel from .Petals import PetalsModel +from .Random import RandomModel
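
The patch ends here. As a quick sanity check, the sketch below shows how the new RandomModel could be exercised on its own for the two task types used by the assets added in this change. It is not part of the diff, and it assumes ModelBase.__init__ tolerates the leftover keyword argument (class_labels), which the asset configs imply but which is not verified here.

from llmebench.models import RandomModel
from llmebench.tasks import TaskType

# Binary checkworthiness baseline: random.choice over the two labels.
# Note: RandomModel seeds `random` with 2023 in its constructor, so the
# output is reproducible within a single process.
clf = RandomModel(task_type=TaskType.Classification, class_labels=["0", "1"])
response = clf.prompt("an arbitrary input sentence")
print(response)                          # e.g. {"random_response": "0"}
print(clf.summarize_response(response))  # e.g. "0"

# Multi-label propaganda baseline: each technique is kept independently
# with probability ~0.5 (random.random() > 0.5).
techniques = ["Loaded_Language", "Repetition", "Slogans", "no_technique"]
ml = RandomModel(
    task_type=TaskType.MultiLabelClassification,
    class_labels=techniques,
)
labels = ml.prompt("an arbitrary paragraph")["random_response"]

# The asset-level post_process then collapses any prediction containing
# "no_technique" down to just ["no_technique"], mirroring the new
# SemEval23T3Propaganda_Random assets above.
prediction = ["no_technique"] if "no_technique" in labels else labels
print(prediction)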