This repository has been archived by the owner on Jul 23, 2024. It is now read-only.

[Task Submission] Prompt ComVE #31

Open: wants to merge 6 commits into base `main`
4 changes: 4 additions & 0 deletions src/genbench/tasks/promptcomve/__init__.py
@@ -0,0 +1,4 @@
from genbench import TaskDict

class PromptcomveTask(TaskDict):
pass
56 changes: 56 additions & 0 deletions src/genbench/tasks/promptcomve/config.jsonnet
@@ -0,0 +1,56 @@
{
name: 'PromptComVE',

description: 'Commonsense Validation and Explanation (ComVE) is a benchmark for studying the generalization abilities of instruction-tuned LLMs (It-LLMs) on multiple-choice answers.',

keywords: [
'commonsense reasoning',
'commonsense explanations',
],

authors: [
'Leonardo Ranaldi',
'Giulia Pucci',
],

data_source: {
type: 'manual',
test: 'https://raw.githubusercontent.com/lranaldii/Prompt-ComVE/main/data/Prompt_ComVE_t1.jsonl',
},


has_validation_set: false,
has_train_set: false,

field_mapping: {
input: 'prompt',
target: 'target',
},

task_type: 'free_form',


evaluation_metrics: [
{
hf_id: 'accuracy',
git_commit_sha: '1379e97f313a00486e9885aec530e1ef00def9b9',
best_score: 1.0,
}
],

preparation_strategies: {
// A recipe for preparing the model to perform the task by configuring its prompt.
// This recipe is suitable for generative LMs such as GPT-3, OPT, T5, etc.
// We provide a few options for configuring the prompt. But, the task creator can
// also provide a custom prompt preparation in the task's Python class.
prompt_based_testing: {
prompt_builder: {
instruction_zero_shot: '(following Statements), answer choice (answers)\n',
output_prefix: '\nAnswer: ',
input_prefix: '', // no input prefix since Question follows Context
append_choices_to_input: false,
}
},
},
}
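The `prompt_builder` fields above compose the zero-shot prompt from an instruction, an (empty) input prefix, the example input, and an output prefix. A minimal sketch of that composition, assuming GenBench concatenates the fields in that order (the exact concatenation logic is GenBench's, not confirmed here):

```python
# Field values copied from the prompt_builder config above.
INSTRUCTION = '(following Statements), answer choice (answers)\n'
INPUT_PREFIX = ''  # no input prefix, per the config comment
OUTPUT_PREFIX = '\nAnswer: '


def build_prompt(example_input: str) -> str:
    """Hypothetical assembly of a zero-shot prompt from the config fields."""
    return INSTRUCTION + INPUT_PREFIX + example_input + OUTPUT_PREFIX


prompt = build_prompt(
    "Which of the two statements is impossible? "
    "a)he put an elephant into the fridge. b)he put a turkey into the fridge"
)
print(prompt)
```

The resulting string ends with `Answer: `, leaving the model to complete the choice label.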
11 changes: 11 additions & 0 deletions src/genbench/tasks/promptcomve/doc.md
@@ -0,0 +1,11 @@
# PromptComVE

## Motivation
Recently, zero- and few-shot learning with various prompt-based, Instruction-tuned Large Language Models (It-LLMs) has made extraordinary progress. Indeed, prompts seem to help models learn faster, just as humans learn faster when they receive task instructions expressed in natural language. However, these models often produce good predictions even with meaningless, irrelevant, and misleading prompts, even in the zero-shot setting. This resource proposes a benchmark to probe generalization abilities on commonsense reasoning tasks. Inspired by the Commonsense Validation and Explanation (ComVE) task, we systematically analyzed several It-LLMs, in particular whether they can distinguish statements that make sense from those that do not while providing comprehensive explanations. We show that It-LLMs have good generalization abilities and achieve good accuracy on commonsense reasoning tasks. However, despite the impressive performance and improvements, we found some weaknesses that cast doubt on whether the models understand task instructions in a way similar to how humans use them.

## Examples
**match**: {"input": "Which of the two statements is impossible? a)he put an elephant into the fridge. b)he put a turkey into the fridge", "target": "a)"}


## GenBench Eval card
*Describe what kind of generalisation your task is evaluating, and include a [genbench eval card](https://genbench.org/eval_cards/) for your task*.
8 changes: 8 additions & 0 deletions src/genbench/tasks/promptcomve/task.py
@@ -0,0 +1,8 @@
from typing import Any, Dict

from genbench import Task


class PromptcomveTask(Task):
def format_example(self, example: Dict[str, Any]) -> Dict[str, Any]:
return {"input": example["prompt"], "target": example["target"]}
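The `format_example` method is a plain field rename from the raw dataset schema to GenBench's `input`/`target` schema. A standalone sketch of the same mapping (the real method lives on a `genbench.Task` subclass; `genbench` is not imported here):

```python
from typing import Any, Dict


def format_example(example: Dict[str, Any]) -> Dict[str, Any]:
    """Rename the raw 'prompt'/'target' fields to GenBench's schema."""
    return {"input": example["prompt"], "target": example["target"]}


# Record shape taken from the doc.md example above.
sample = {
    "prompt": "Which of the two statements is impossible? "
              "a)he put an elephant into the fridge. "
              "b)he put a turkey into the fridge",
    "target": "a)",
}
formatted = format_example(sample)
print(formatted["target"])  # a)
```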
Empty file.
45 changes: 0 additions & 45 deletions src/genbench/tasks/sample_task/config.jsonnet

This file was deleted.

4 changes: 0 additions & 4 deletions src/genbench/tasks/sample_task/doc.md

This file was deleted.

5 changes: 0 additions & 5 deletions src/genbench/tasks/sample_task/task.py

This file was deleted.