From 47cbbef43468f97b0e65bfb70149064e3f71d5a3 Mon Sep 17 00:00:00 2001
From: SkySuperCat
Date: Tue, 22 Oct 2024 02:46:36 -0400
Subject: [PATCH] Add mmlusr

---
 src/genbench/tasks/mmlusr/__init__.py    |  0
 src/genbench/tasks/mmlusr/config.jsonnet | 54 ++++++++++++++++++++++++
 src/genbench/tasks/mmlusr/doc.md         | 24 +++++++++++
 src/genbench/tasks/mmlusr/task.py        | 18 ++++++++
 4 files changed, 96 insertions(+)
 create mode 100644 src/genbench/tasks/mmlusr/__init__.py
 create mode 100644 src/genbench/tasks/mmlusr/config.jsonnet
 create mode 100644 src/genbench/tasks/mmlusr/doc.md
 create mode 100644 src/genbench/tasks/mmlusr/task.py

diff --git a/src/genbench/tasks/mmlusr/__init__.py b/src/genbench/tasks/mmlusr/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/genbench/tasks/mmlusr/config.jsonnet b/src/genbench/tasks/mmlusr/config.jsonnet
new file mode 100644
index 0000000..f2720ce
--- /dev/null
+++ b/src/genbench/tasks/mmlusr/config.jsonnet
@@ -0,0 +1,54 @@
+{
+  name: 'mmlusr',
+
+  description: 'MMLU-SR measures the true comprehension abilities of Large Language Models (LLMs) by testing their question-answering performance when key terms are replaced with newly defined substitute terms.',
+
+  keywords: [
+    'LLMs',
+    'Benchmarks',
+    'Dataset',
+    'Reasoning'
+  ],
+
+  authors: [
+    'Wentian Wang',
+    'Sarthak Jain',
+    'Paul Kantor',
+    'Jacob Feldman',
+    'Lazaros Gallos',
+    'Hao Wang'
+  ],
+
+  data_source: {
+    type: 'hf',
+    hf_id: [
+      'NiniCat/MMLU-SR',
+      'answer_only_abstract_algebra'  // switch to another subset by changing this, e.g. 'question_and_answer_abstract_algebra'
+    ],
+    git_commit_sha: '505322b292ac81cc83c0942c2d2930af5ba31068'
+  },
+
+  has_validation_set: false,
+  has_train_set: true,
+
+  task_type: 'multiple_choice',
+
+  evaluation_metrics: [
+    {
+      hf_id: 'accuracy',
+      best_score: 1.0,
+      git_commit_sha: '330abb383de68be32352dd876716f644bc71c1e5',
+    }
+  ],
+
+  preparation_strategies: {
+    prompt_based_testing: {
+      prompt_builder: {
+        instruction_zero_shot: 'Please respond to each question with \'Answer: X\', where X is the letter of the correct choice. Avoid additional explanations.\n\n',
+        instruction_few_shot: 'Follow the given examples and answer the question. Please respond to each question with \'Answer: X\', where X is the letter of the correct choice. Avoid additional explanations.\n\n',
+        input_prefix: 'Q: ',
+        output_prefix: '\nA: '
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/src/genbench/tasks/mmlusr/doc.md b/src/genbench/tasks/mmlusr/doc.md
new file mode 100644
index 0000000..5d2fa68
--- /dev/null
+++ b/src/genbench/tasks/mmlusr/doc.md
@@ -0,0 +1,24 @@
+# mmlusr
+
+## Abstract
+*We propose MMLU-SR, a novel dataset designed to measure the true comprehension abilities of Large Language Models (LLMs) by challenging their performance in question-answering tasks with modified terms. We reasoned that an agent that "truly" understands a concept can still evaluate it when key terms are replaced by suitably defined alternate terms, and sought to differentiate such comprehension from mere text replacement. In our study, we modified standardized test questions by replacing a key term with a dummy word along with its definition. The key term could appear in the question, in the answers, or in both. Notwithstanding the high scores achieved by recent popular LLMs on the MMLU leaderboard, we found a substantial reduction in model performance after such replacement, suggesting poor comprehension. This new dataset provides a rigorous benchmark for testing true model comprehension, and poses a challenge to the broader scientific community.*
+
+## Examples
+Question: Suppose 'Dummy' means 'ubiquitous, mostly free-living organisms often consisting of one biological cell.' Which of the following best describes the human body's defense mechanism against environmental Dummy?
+A. Hair in the nose
+B. Suppose 'Queen' means 'the moist, inner lining of some organs and body cavities.' Queen
+C. Suppose 'Noise' means 'cells that form new bones and grow and heal existing bones.' Noise
+D. Suppose 'Bard' means 'an extracellular fluid produced and secreted by salivary glands in the mouth.' Bard
+Answer: B
+
+## Usage
+*Please see the project repository: https://github.com/Wang-ML-Lab/MMLU-SR*
+
+## Data Source
+*The dataset can be retrieved from either Hugging Face (NiniCat/MMLU-SR) or GitHub (https://github.com/Wang-ML-Lab/MMLU-SR).*
+
+## Limitations and Bias
+*NA*
+
+## GenBench Eval card
+*The dataset provides dev and test splits: the dev set supplies the few-shot prompting examples, and the test set is the actual evaluation set.*
diff --git a/src/genbench/tasks/mmlusr/task.py b/src/genbench/tasks/mmlusr/task.py
new file mode 100644
index 0000000..a1b1af3
--- /dev/null
+++ b/src/genbench/tasks/mmlusr/task.py
@@ -0,0 +1,18 @@
+from typing import Any, Dict
+
+from genbench import Task
+
+
+class MMLUSRTask(Task):
+    def format_example(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        options = [example["choice1"], example["choice2"], example["choice3"], example["choice4"]]
+
+        # The answer may already be a 0-based index (0-3)...
+        if isinstance(example["answer"], (int, float)):
+            target = int(example["answer"])
+        else:
+            # ...or a letter (A, B, C, D) that must be mapped to an index.
+            answer_map = {"A": 0, "B": 1, "C": 2, "D": 3}
+            target = answer_map[example["answer"]]
+
+        return {"input": example["question"], "target": target, "target_options": options}
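
As a sanity check on `format_example`, here is a minimal standalone sketch (not part of the patch) that replicates its mapping for a hypothetical record, using the column names the task assumes (`question`, `choice1`-`choice4`, `answer`):

```python
# Hypothetical MMLU-SR record; field names follow the columns assumed in task.py.
example = {
    "question": (
        "Suppose 'Dummy' means 'ubiquitous, mostly free-living organisms often "
        "consisting of one biological cell.' Which of the following best describes "
        "the human body's defense mechanism against environmental Dummy?"
    ),
    "choice1": "Hair in the nose",
    "choice2": "Suppose 'Queen' means 'the moist, inner lining of some organs and body cavities.' Queen",
    "choice3": "Suppose 'Noise' means 'cells that form new bones and grow and heal existing bones.' Noise",
    "choice4": "Suppose 'Bard' means 'an extracellular fluid produced and secreted by salivary glands in the mouth.' Bard",
    "answer": "B",
}

# Same logic as MMLUSRTask.format_example: collect the options and map a
# letter answer to a 0-based index (numeric answers pass through unchanged).
options = [example["choice1"], example["choice2"], example["choice3"], example["choice4"]]
answer_map = {"A": 0, "B": 1, "C": 2, "D": 3}
if isinstance(example["answer"], (int, float)):
    target = int(example["answer"])
else:
    target = answer_map[example["answer"]]

formatted = {"input": example["question"], "target": target, "target_options": options}
print(formatted["target"])  # 1 -> the 'Queen' (mucous membranes) choice
```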
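Once merged, the task should be loadable through the standard GenBench entry point. The sketch below assumes the `load_task` / `PreparationStrategy` API shown in the genbench_cbt README together with the zero-shot prompt settings in `config.jsonnet`; treat it as illustrative rather than a verified invocation:

```python
from genbench import load_task
from genbench.api import PreparationStrategy

# Load the task added by this patch (assumes a genbench_cbt environment
# with this directory registered under src/genbench/tasks/).
task = load_task("mmlusr")

# Build zero-shot prompts from the prompt_builder settings in config.jsonnet;
# get_prepared_datasets is assumed to return one dataset per entry in shot_list.
datasets = task.get_prepared_datasets(
    PreparationStrategy.PROMPT_BASED_TESTING, shot_list=[0]
)
zero_shot = datasets[0]
print(zero_shot[0]["input"])  # first prompt, ending in "Q: <question>\nA: "
```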