This repository has been archived by the owner on Jul 23, 2024. It is now read-only.

[Task Submission] OperationsResearchQA (operationsresearchqa) #22

Closed
wants to merge 11 commits
7 changes: 7 additions & 0 deletions .gitignore
@@ -87,13 +87,20 @@ celerybeat-schedule
*.sage.py

# Environments
.genbench_venv
.submission_venv
.env
.venv
submission_
genbench_venv/
submission_venv/
env/
venv/
ENV/
env.bak/
venv.bak/
genbench_venv.bak/
submission_venv.bak/

# Spyder project settings
.spyderproject
2 changes: 1 addition & 1 deletion setup.py
@@ -7,7 +7,7 @@
# We use Click to power our command line interface
"Click>=8.1",
# Jsonnet is used as a format for task configs
"jsonnet>=0.20",
"jsonnet-binary",
# dataclass_factory is used to convert jsonnet to dataclasses
"dataclass_factory>=2.16",
# We use datasets to load data from HuggingFace datasets
151 changes: 151 additions & 0 deletions src/cot_inference.py
@@ -0,0 +1,151 @@
import json
import os
import random
import time
import warnings
from argparse import ArgumentParser
from typing import Dict, List

from huggingface_hub import InferenceClient
from tqdm import tqdm

import final_inference_utils
from genbench import load_task
from genbench.api import PreparationStrategy


warnings.filterwarnings("ignore", message="Unverified HTTPS request is being made to host*")

os.environ["CURL_CA_BUNDLE"] = ""
folder_path = final_inference_utils.get_folder_path("cot")

# LLM Parameters
model_7b_chat = "meta-llama/Llama-2-7b-chat-hf"
model_70b_chat = "meta-llama/Llama-2-70b-chat-hf"
model_13b_chat = "meta-llama/Llama-2-13b-chat-hf"
model_vicuna_13B_v1dot5 = "lmsys/vicuna-13b-v1.5-16k"
model_starcoder_chat = "HuggingFaceH4/starchat-beta"


llm_hf_dict = {
    "llama2-7b-chat": model_7b_chat,
    "llama2-13b-chat": model_13b_chat,
    "llama2-70b-chat": model_70b_chat,
    "starcoderchat": model_starcoder_chat,
}

# Inference Types
LLAMA = "llama"


def get_input_prompt_for_ans(reasoning_prompt, reasoning):
    # Append the generated reasoning and an answer-extraction suffix to the original prompt.
    answer_extraction_prompt = "\nAnswer: Therefore, among A through D, the answer is ("
    return reasoning_prompt + reasoning + answer_extraction_prompt


def ltr_to_idx(ltr):
    # Map a single letter ("A"-"D") to its zero-based index; return a sentinel for invalid input.
    if len(ltr) != 1:
        return -10
    return ord(ltr) - ord("A")


def get_config_and_api_key_name_from_args():
    parser = ArgumentParser()
    parser.add_argument("--ds_name", help="ORQA subtask to run ('standard' or 'cot')", default="cot", required=False)
    parser.add_argument("--model_name", help="Model name", default="llama2-7b-chat", required=False)
    parser.add_argument("--n_shots", type=int, help="# of shots", default=0, required=False)
    parser.add_argument("--verbose", type=int, help="1 to debug", default=0, required=False)

    args = parser.parse_args()
    print(args.verbose)
    return args


def get_llama2_predictions(ds, ds_ans, model_name):
    cot_pipeline = InferenceClient(model=model_name).text_generation

    prediction = []
    counter = 0
    for (
        ds_i,
        ds_ans_i,
    ) in tqdm(zip(ds, ds_ans), total=len(ds_ans)):
        # Step 1: elicit a free-form chain of thought for this instance.
        seq = cot_pipeline(ds_i["input"], do_sample=False, max_new_tokens=500, return_full_text=False)

        new_input_prompt = get_input_prompt_for_ans(ds_ans_i["input"], seq)

        # Step 2: ask for a single answer token conditioned on the generated reasoning.
        seq = cot_pipeline(new_input_prompt, do_sample=False, max_new_tokens=1, return_full_text=False)

        llm_result = -10
        llm_str_op = seq.strip()
        if llm_str_op in {"A", "B", "C", "D"}:
            llm_result = ltr_to_idx(llm_str_op)

        # If the generated answer is not one of the target options, count it as wrong
        # by recording a deliberately incorrect option.
        if llm_result > 3 or llm_result < 0:
            llm_result = (ltr_to_idx(ds_i["original_target"]) + 1) % len(ds_i["target_options"])
            counter += 1

        prediction.append(
            {
                "reasoning_extraction_prompt": ds_i["input"],
                "answer_extraction_prompt": str(new_input_prompt),
                "target": llm_result,
            }
        )

    print(f"{counter} times the LLM generated an answer that was not in the target options")

    return prediction


def generate_llm_prediction(ds, ds_ans, type, **kwargs) -> List[Dict]:
    """
    Return a list of dicts in the following format
    [{"target": answer1}, {"target": answer2}, ...]
    The list should be in the exact same order as the dataset (ds) passed.
    """

    if type in llm_hf_dict:
        prediction = get_llama2_predictions(ds, ds_ans, llm_hf_dict[type])
    else:
        # Fall back to a random-guess baseline for unrecognized model names.
        prediction = [{"target": random.randint(0, 3)} for _ in range(len(ds))]

    # Cache the predictions to disk so runs can be inspected or re-scored later.
    file_model_name = type.replace("/", "_")
    file_model_name = file_model_name.replace(".", "dot")
    os.makedirs(folder_path, exist_ok=True)
    with open(f"{folder_path}/{file_model_name}_{str(time.time())}.json", "w+") as outfile:
        outfile.write(json.dumps(prediction, indent=4))

    return prediction


if __name__ == "__main__":
    config = get_config_and_api_key_name_from_args()
    verbose = config.verbose
    task = load_task("operationsresearchqa:" + config.ds_name)
    ds = task.get_prepared_datasets(
        PreparationStrategy.PROMPT_BASED_TESTING, shot_list=[config.n_shots], reasoning_extraction=True
    )[config.n_shots]

    task = load_task("operationsresearchqa:" + config.ds_name)
    ds_ans = task.get_prepared_datasets(
        PreparationStrategy.PROMPT_BASED_TESTING, shot_list=[config.n_shots], reasoning_extraction=False
    )[config.n_shots]

    if verbose:
        print(ds)

    if verbose:
        for i, (ds_i, ds_ans_i) in enumerate(zip(ds, ds_ans)):
            print(i)
            print(json.dumps(ds_i, indent=4))
            print(json.dumps(ds_ans_i, indent=4))
            input("next")

    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    print("Running model", config.model_name)
    llm_output = generate_llm_prediction(ds, ds_ans, config.model_name)
    result = task.evaluate_predictions(predictions=llm_output, gold=ds)
    print("Score from experiment", result["hf_accuracy__accuracy"])
    print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
5 changes: 5 additions & 0 deletions src/final_inference_utils.py
@@ -0,0 +1,5 @@
import sys


def get_folder_path(experiment_name):
    # Resolve the output directory for an experiment relative to the running script's location.
    return sys.path[0] + "/output_of_llms/" + experiment_name
5 changes: 5 additions & 0 deletions src/genbench/tasks/operationsresearchqa/__init__.py
@@ -0,0 +1,5 @@
from genbench import TaskDict


class Operationsresearchqa(TaskDict):
    pass
38 changes: 38 additions & 0 deletions src/genbench/tasks/operationsresearchqa/config.jsonnet
@@ -0,0 +1,38 @@
{
name: 'OperationsResearchQA',

description: 'ORQA (or Operations Research QA) is a benchmark to study the generalizability of PLMs towards operations research. We want to ask the question, "Can PLMs reason about operations research?"',

keywords: [
'Operations Research',
'Question Answering',
'Domain Specialization',
'Practical',
'Cross Domain',
'Covariate Shift',
'Naturally Occurring Shift Source',
'Pretrain-test Shift Locus',
'Knowledge Multiple Choice QA',
'Optimization Modeling',
'Reasoning',
'Context',
'Large Language Models',
],

authors: [
'Rindra Ramamonjison',
'Mahdi Mostajabdaveh',
'Timothy Yu',
'Giuseppe Carenini',
'Samarendra Dash',
'Serge Jabo Byusa',
'Zirui Zhou',
'Yong Zhang',
],

subtasks_order: [
'standard',
'cot',

],
}
Empty file.
69 changes: 69 additions & 0 deletions src/genbench/tasks/operationsresearchqa/cot/config.jsonnet
@@ -0,0 +1,69 @@
{
name: 'OperationsResearchQA (cot)',

description: 'ORQA (or Operations Research QA) is a benchmark to study the generalizability of PLMs towards operations research. We want to ask the question, "Can PLMs reason about operations research?"',

keywords: [
'Operations Research',
'Question Answering',
'Domain Specialization',
'Practical',
'Cross Domain',
'Covariate Shift',
'Naturally Occurring Shift Source',
'Pretrain-test Shift Locus',
'Knowledge Multiple Choice QA',
'Optimization Modeling',
'Reasoning',
'Context',
'Large Language Models',
],

authors: [
'Rindra Ramamonjison',
'Mahdi Mostajabdaveh',
'Timothy Yu',
'Giuseppe Carenini',
'Samarendra Dash',
'Serge Jabo Byusa',
'Zirui Zhou',
'Yong Zhang',
],

data_source: {
type: 'manual',
test: 'https://raw.githubusercontent.com/OpResearchQA/ORQA_data/main/test_dataset.jsonl', // link to our sample test set
validation: 'https://raw.githubusercontent.com/OpResearchQA/ORQA_data/main/val_dataset.jsonl', // link to our sample dev set
},

has_validation_set: true,
has_train_set: false,

task_type: 'multiple_choice',


evaluation_metrics: [
{
hf_id: 'accuracy',
git_commit_sha: '34d6add55811828baef83e0d7c6826e2193f7b6a',
best_score: 1.0,
}
],

preparation_strategies: {
// A recipe for preparing the model to perform the task by configuring its prompt.
// This recipe is suitable for generative LMs such as GPT-3, OPT, T5, etc.
// We provide a few options for configuring the prompt. But, the task creator can
// also provide a custom prompt preparation in the task's Python class.
prompt_based_testing: {
prompt_builder: {
instruction_zero_shot: '["Given the context (following Context:), provide the chain of thoughts (following Reasoning:) to solve the question (following Question:). Remember, only one option is correct.\\n", "Given the context (following Context:), the reasoning (following Reasoning:), select the most appropriate answer to the question (following Question:). Answer only \'A\', \'B\', \'C\', or \'D\'. There is only one correct answer.\\n"]',
instruction_few_shot: '["Given the context (following Context:), provide the chain of thoughts (following Reasoning:) to solve the question (following Question:). Remember, only one option is correct.\\n\\nHere are some examples: \\n\\n", "Given the context (following Context:), the reasoning (following Reasoning:), select the most appropriate answer to the question (following Question:). Answer only \'A\', \'B\', \'C\', or \'D\'. There is only one correct answer.\\n\\nHere are some examples: \\n\\n"]',
output_prefix: '["\\nReasoning: Let\'s think step by step ", "\\nAnswer: Among A through D, the answer is ("]',
input_prefix: '', // no input prefix since Question follows Context
append_choices_to_input: false,
few_shot_example_separator: '["\\n\\n", ")\\n\\n"]',
}
},
},
}
81 changes: 81 additions & 0 deletions src/genbench/tasks/operationsresearchqa/cot/doc.md
@@ -0,0 +1,81 @@
# OperationsResearchQA (cot)

## Summary

**ORQA: Can Pretrained Language Models reason about Operations Research?**

We propose Operations Research QA (ORQA) as a new benchmark to evaluate the ability of pretrained Large Language Models (LLMs) to generalize to new technical domains. Our benchmark considers the cross-domain shift issue of LLMs and focuses on the multi-choice question answering task. In our new dataset, the target domain is Operations Research (OR), and the task tests both the language models' domain-specific knowledge and reasoning skills on optimization modeling. Our dataset is handcrafted by domain experts, i.e. OR experts, and is representative of different types of optimization and various application domains, such as vehicle routing, production planning, or investment portfolio allocation.

We contribute a novel multi-choice QA dataset for specialized technical domains. Each instance of the dataset contains the following:

1. An `input` context describing an optimization model as a case study.
2. A `question` asking about the understanding of the problem specifications (e.g. objective criterion or constraints), the underlying components of the corresponding optimization model (e.g. the elements participating in the optimization), as well as the structure and logic of the optimization model (e.g. the logical relation between two decision activities).
3. A list of `options` for the answer, which was handcrafted by OR experts to make the question challenging to guess. The LLM must select the correct answer from a list of four options.
4. The correct `target` answer.
5. For chain-of-thought prompting, there is an additional `reasoning` field. Please refer to the accompanying paper submission for more details; a short data-inspection sketch follows this list.
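The hosted jsonl splits can be inspected directly. The sketch below is a minimal illustration that assumes only the `requests` package and the validation-set URL listed in the Data Source section; the exact key names are not hard-coded here and should be checked against the actual schema.

```python
# Minimal sketch: download the validation split and inspect the fields of one record.
# Assumes the `requests` package is installed; the field list above describes the content,
# but the concrete jsonl key names should be verified against the released data.
import json

import requests

VAL_URL = "https://raw.githubusercontent.com/OpResearchQA/ORQA_data/main/val_dataset.jsonl"

records = [json.loads(line) for line in requests.get(VAL_URL).text.splitlines() if line.strip()]
print(len(records))               # expected: 30 validation instances
print(sorted(records[0].keys()))  # context/question/options/target/reasoning-style fields
```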

It is important to emphasize that each question is designed to require basic knowledge of optimization, an understanding of the problem description, and, more importantly, multi-step reasoning. We have selected the field of operations research due to the high level of technical expertise required to understand and answer questions about optimization modeling.

We have carefully selected the optimization problems from various domains. Furthermore, each question/choice/answer instance has been curated and verified by domain experts. As a result, ORQA is positioned to be an impactful benchmark that can contribute to the evaluation of general-purpose LLMs' generalization ability on specialized domains and tasks.

CoT is performed in a two-step approach: first a reasoning-eliciting prompt, followed by an answer-eliciting prompt. Please refer to the accompanying paper for more details.
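A minimal sketch of this two-step flow, adapted from the submitted `src/cot_inference.py`; the `answer_with_cot` name and the `generate` callable are placeholders for whichever text-generation client is used.

```python
# Illustrative sketch of the two-step CoT flow used by src/cot_inference.py.
# `generate` is a placeholder for any greedy text-generation callable, e.g.
# huggingface_hub.InferenceClient(model=...).text_generation.
def answer_with_cot(reasoning_prompt: str, answer_prompt: str, generate) -> int:
    # Step 1: elicit a free-form chain of thought.
    reasoning = generate(reasoning_prompt, do_sample=False, max_new_tokens=500)

    # Step 2: append the reasoning and an answer-extraction suffix, then ask for one token.
    suffix = "\nAnswer: Therefore, among A through D, the answer is ("
    letter = generate(answer_prompt + reasoning + suffix, do_sample=False, max_new_tokens=1).strip()

    # Map "A"-"D" to a zero-based option index; anything else is treated as invalid.
    return ord(letter) - ord("A") if letter in {"A", "B", "C", "D"} else -1
```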

## Motivation

Our proposed Operations Research Question Answering (ORQA) benchmark is highly relevant to the GenBench collaborative task, as it addresses the crucial aspect of generalization in natural language processing (NLP) models. Specifically focused on pretrained Large Language Models (LLMs) within the domain of Operations Research (OR), ORQA is designed to tackle the significant cross-domain shift issue. By presenting a multi-choice question answering task that tests reasoning skills and domain-specific knowledge in optimization modeling, ORQA provides a comprehensive evaluation platform.

As the first benchmark of its kind in Operations Research, ORQA serves as an open and collaborative platform to assess LLMs' real-world performance in specialized technical domains. The primary goal is to ensure responsible and well-informed deployment of LLMs, minimizing potential negative impacts. Notably, unlike technical reports from commercial LLM vendors, ORQA is an open-source benchmark, ensuring impartiality and transparency in the evaluation process. This approach aligns perfectly with our motivation to advance research on generalization in NLP and promote the responsible development of LLMs. By creating a benchmark that specifically targets optimization and Operations Research, we aim to foster a collective effort within the NLP community, enhancing the reliability and practical applicability of LLMs in professional domains.

## Examples

Please refer to the data section (Section 4) and the appendix of the accompanying paper for more examples. Here is one example:

**<u>EXAMPLE 1</u>**

**Context:**

In the bustling district of Paris lies "La Douceur Parisienne," a patisserie celebrated for its exquisite pastries. Mademoiselle Elise, the owner, finds herself in a perpetual balancing act, juggling the freshness of her products with the economics of running her establishment. Every item on her display boasts a distinct shelf life, determining its freshness window. Once this period lapses, the product is no longer up to the patisserie's esteemed standards. When Mademoiselle Elise places an order for ingredients, a fixed cost is immediately incurred. This cost isn't just a function of the quantity but also of the unique suppliers she sources her premium ingredients from. Alongside this, the selling price of each pastry, when weighed against the purchase cost of its ingredients, paints a picture of the potential profit for every item sold. However, there's an inherent challenge in procuring these ingredients. They come in defined batches, meaning even if she needs a smaller quantity, the entire batch must be ordered. This batching can sometimes lead to an excess, posing a risk given the perishable nature of her products. Furthermore, the streets of Paris, with their unpredictable nature, bring another layer of complexity. On some days, a burst of tourists might grace her establishment, while on others, only the familiar faces of locals appear. This ebb and flow in demand means that her ingredient orders must be agile, flexible, and well-timed. With these intricate challenges, Mademoiselle Elise's mission is clear: to craft an ordering strategy that efficiently straddles between reducing costs from ordering, potential waste, and maximizing sales revenue. Through this balanced approach, she hopes to ensure that every day at "La Douceur Parisienne" commences with fresh, impeccable pastries and ends on a note of robust profitability.

**Question:**

What is the type of optimization model related to this problem?

**Reasoning:**

Mademoiselle Elise's objective is to balance costs, potential waste, and maximize sales revenue in her patisserie. There are multiple variables to consider, such as ingredient costs, pastry selling prices, and varying demand. The problem requires accommodating constraints like fixed ordering costs, perishability of products, batching of ingredients, and unpredictable customer flow. Despite the complexities, the relationships among these variables and constraints appear to be linear. Based on the nature of the problem and the linearity of the relationships, the most appropriate optimization model is Linear Programming. Answer: (B) Linear Programming.

**Target:**

1

**Target Options:**

0: Power Cone Programming

1: Linear Programming

2: Non-Linear Programming

3: Exponential Cone Programming
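As the example shows, `target` is a zero-based index into the option list (here 1, i.e. option B), while the prompts ask the model for a letter. A small sketch of the index/letter conversion used by the inference script:

```python
# The target is a zero-based index into the option list; prompts ask for a letter A-D.
target_options = [
    "Power Cone Programming",
    "Linear Programming",
    "Non-Linear Programming",
    "Exponential Cone Programming",
]
target = 1

letter = chr(ord("A") + target)                # index -> letter: 1 -> "B"
assert ord(letter) - ord("A") == target        # letter -> index, as in ltr_to_idx()
print(f"({letter}) {target_options[target]}")  # (B) Linear Programming
```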



## Usage

The task can be run by calling `genbench_cbt/src/cot_inference.py`. This script goes through an inference and evaluation cycle using the chain-of-thought prompts. It currently runs inference through the Hugging Face Inference API (see `llm_hf_dict` in the script); you may alter the code to run inference using an LLM of your choice.
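For reference, here is a minimal sketch of loading the subtask and its prepared prompts through the GenBench API, mirroring the `__main__` block of `cot_inference.py`; the CLI invocation in the comment uses the flags that script defines.

```python
# Sketch mirroring the __main__ block of src/cot_inference.py.
# Equivalent CLI run: python src/cot_inference.py --ds_name cot --model_name llama2-7b-chat --n_shots 0
from genbench import load_task
from genbench.api import PreparationStrategy

task = load_task("operationsresearchqa:cot")

# Step-1 (reasoning-eliciting) and step-2 (answer-eliciting) prompt sets, zero-shot.
ds_reasoning = task.get_prepared_datasets(
    PreparationStrategy.PROMPT_BASED_TESTING, shot_list=[0], reasoning_extraction=True
)[0]
ds_answer = task.get_prepared_datasets(
    PreparationStrategy.PROMPT_BASED_TESTING, shot_list=[0], reasoning_extraction=False
)[0]

print(ds_reasoning[0]["input"])  # full step-1 prompt for the first instance

# After running a model over both prompt sets, score the predictions:
# result = task.evaluate_predictions(predictions=[{"target": 1}, ...], gold=ds_reasoning)
# print(result["hf_accuracy__accuracy"])
```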

## Data Source

The data creation process is described in detail in the accompanying paper. The dataset is separated into validation and test splits with 30 and 1483 samples, respectively. These can be accessed at the following links:

Test - https://raw.githubusercontent.com/OpResearchQA/ORQA_data/main/test_dataset.jsonl

Validation - https://raw.githubusercontent.com/OpResearchQA/ORQA_data/main/val_dataset.jsonl

## Limitations and Bias

Any decision-making problem may be formulated as an optimization model. The field of OR therefore spans all application domains, and it is infeasible for our dataset to represent them all comprehensively. This benchmark is limited by its domain coverage, as it targets the more common application domains of OR.

## GenBench Eval card

The GenBench Eval card is the same as for the parent ORQA task. The subtasks exist only to separate the two prompting approaches: standard vs. chain-of-thought.