diff --git a/README.md b/README.md
index 80b01560..8ce2343a 100644
--- a/README.md
+++ b/README.md
@@ -78,6 +78,7 @@ Details in [limitations](#limitations).
 - [Data Release](#data-release)
 - [Differences with AlpacaFarm](#differences-with-alpacafarm)
 - [Related work](#related-work)
+ - [Interpreting Chain of Thought](#interpreting-chain-of-thought-when-available)
 - [Major updates](#major-updates)
@@ -1253,9 +1254,9 @@ For example:

Interpreting Chain of Thought, when available

-For some annotators, e.g. `alpaca_eval_cot_gpt4_turbo_fn` we use chan of thought reasoning to make the models preferences more interpretable. Those can then be found under `raw_annotations` in the `annotations.json` file. We describe how to interpret those below.
+For some annotators, e.g. `alpaca_eval_cot_gpt4_turbo_fn`, we use chain of thought reasoning to make the model's preferences more interpretable. The reasoning can then be found under `concise_explanation` in the `annotations.json` file. To interpret it, you should also look at `referenced_models`, which translates the temporary model names (used in the prompt) to the actual outputs. Below, we explain what happens behind the scenes.

-To better understand the auto-annotations, you can check the `raw_annotations["concise_explanation]` column in `annotations.json` (e.g. [here](https://github.com/tatsu-lab/alpaca_eval/tree/main/results/gpt4/alpaca_eval_cot_gpt4_turbo_fn/annotations.json)) which contains the chain of thought reasoning of the auto annotator. Note that the raw_annotations is not modified by the randomization of the order of the outputs. In particular, `"m"` and `"M"` can sometime refer to the first model (the reference) and sometime to the second model (the model being evaluated). To understand which model is being referred to, you should use the column `preference` and `ordered_models`. When pushing the annotations to GitHub we use those to add a dictionary `"referenced_models"` mapping the model names to the corresponding outputs (see [`add_referenced_model_`](https://github.com/tatsu-lab/alpaca_eval/blob/main/docs/format_sample_sheets.py)). For example in the following annotation we see that the preference is 1.0 (i.e. `output_1`) and corresponds to model `M` in `concise_explanation` (see `ordered_models`).
+You can check the `raw_annotations["concise_explanation"]` column in `annotations.json` (e.g. [here](https://github.com/tatsu-lab/alpaca_eval/tree/main/results/gpt4/alpaca_eval_cot_gpt4_turbo_fn/annotations.json)), which contains the chain of thought reasoning of the auto-annotator. Note that `raw_annotations` is not modified by the randomization of the order of the outputs. In particular, `"m"` and `"M"` can sometimes refer to the first model (the reference) and sometimes to the second model (the model being evaluated). To understand which model is being referred to, you should use the columns `preference` and `ordered_models`. To make this easier, we add a column `"referenced_models"` mapping the model names to the corresponding outputs. For example, in the following annotation we see that the preference is 1.0 (i.e. `output_1`) and corresponds to model `M` in `concise_explanation` (see `ordered_models`).
 ```json
 {
diff --git a/docs/format_sample_sheets.py b/docs/format_sample_sheets.py
index 24a9458c..22ba8324 100644
--- a/docs/format_sample_sheets.py
+++ b/docs/format_sample_sheets.py
@@ -1,4 +1,3 @@
-import json
 from pathlib import Path

 import pandas as pd
@@ -9,42 +8,6 @@ RESULTS_DIR = CURRENT_DIR / "results"


-def json_load(el):
-    """Try to load as json"""
-    try:
-        return json.loads(el)
-    except:
-        return el
-
-
-def add_referenced_model_(df):
-    """Add a dictionary to better understand chain of thought in case it's useful"""
-
-    for i, r in df.iterrows():
-        if (
-            isinstance(r["raw_completion"], dict)
-            and "concise_explanation" in r["raw_completion"]
-            and "ordered_models" in r["raw_completion"]
-        ):
-            preference = int(df.loc[i, "preference"])
-            ordered_models = df.loc[i, "raw_completion"]["ordered_models"]
-            for m in ordered_models:
-                if m["rank"] == 1:
-                    first_model = m["model"]
-                elif m["rank"] == 2:
-                    second_model = m["model"]
-                else:
-                    assert False
-
-            if "referenced_models" not in df.columns:
-                df["referenced_models"] = None
-
-            df.at[i, "referenced_models"] = {
-                first_model: f"output_{preference}",
-                second_model: f"output_{3-preference}",
-            }
-
-
 df_reference = pd.read_json(RESULTS_DIR / "text_davinci_003" / F_OUTPUTS, orient="records")
@@ -71,9 +34,4 @@ def add_referenced_model_(df):
     # df["order"] = df.apply(lambda row: order[(row["dataset"], row["instruction"])], axis=1)
     # df = df.sort_values("order").drop("order", axis=1)

-    # jsonify & add the referenced models
-    if "raw_completion" in df:
-        df["raw_completion"] = df["raw_completion"].apply(json_load)
-        add_referenced_model_(df)
-
     df.to_json(f, orient="records", indent=2)
diff --git a/src/alpaca_eval/annotators/base.py b/src/alpaca_eval/annotators/base.py
index bd1f22a5..faa8b2a7 100644
--- a/src/alpaca_eval/annotators/base.py
+++ b/src/alpaca_eval/annotators/base.py
@@ -88,7 +88,11 @@ def __init__(
         annotators_config: Union[utils.AnyPath, list[dict[str, Any]]] = "claude",
         seed: Optional[int] = 0,
         is_avoid_reannotations: bool = True,
-        other_output_keys_to_keep: Sequence[str] = ("price_per_example", "time_per_example", "raw_completion"),
+        other_output_keys_to_keep: Sequence[str] = (
+            "price_per_example",
+            "time_per_example",
+            "raw_completion",
+        ),
         other_input_keys_to_keep: Sequence[str] = (),
         is_store_missing_annotations: bool = True,
         base_dir: Optional[utils.AnyPath] = None,
@@ -102,9 +106,6 @@ def __init__(
         self.is_avoid_reannotations = is_avoid_reannotations
         self.primary_keys = list(primary_keys)
         self.all_keys = self.primary_keys + [self.annotator_column]
-        self.other_output_keys_to_keep = list(other_output_keys_to_keep)
-        self.other_input_keys_to_keep = list(other_input_keys_to_keep)
-        self.other_keys_to_keep = self.other_output_keys_to_keep + self.other_input_keys_to_keep
         self.is_store_missing_annotations = is_store_missing_annotations
         self.is_raise_if_missing_primary_keys = is_raise_if_missing_primary_keys
         if isinstance(annotation_type, str):
@@ -116,6 +117,10 @@ def __init__(
         self.annotators = self._initialize_annotators()
         self.df_annotations = None

+        self.other_input_keys_to_keep = self._get_other_input_keys_to_keep(other_input_keys_to_keep)
+        self.other_output_keys_to_keep = self._get_other_output_keys_to_keep(other_output_keys_to_keep)
+        self.other_keys_to_keep = self.other_output_keys_to_keep + self.other_input_keys_to_keep
+
     ### Abstract methods ###
     #######################

@@ -280,6 +285,12 @@ def _annotate(self, df_to_annotate: pd.DataFrame, **decoding_kwargs) -> pd.DataF
             columns_to_annotate = columns_to_annotate + [
                 c for c in self.other_output_keys_to_keep if c in df_to_annotate.columns
             ]

+            # if df_to_annotate "raw_completion" is a dict, put it back to a json string so that you can reparse it
+            # TODO: this is for backward compatibility, remove in the future
+            if "raw_completion" in df_to_annotate.columns:
+                df_to_annotate["raw_completion"] = df_to_annotate["raw_completion"].apply(
+                    lambda x: json.dumps(x) if isinstance(x, dict) else x
+                )
+
             curr_annotated = self.annotators[annotator](
                 df_to_annotate.loc[curr_idcs, columns_to_annotate],
@@ -421,6 +432,22 @@ def _merge_annotations(

         return df_to_annotate

+    def _get_other_input_keys_to_keep(self, other_input_keys_to_keep: Sequence[str]) -> list[str]:
+        """Get the other input keys to keep, which includes the ones that are needed for the processors."""
+        processor_keys_to_keep = []
+        for a in self.annotators.values():
+            for p in a.processors:
+                processor_keys_to_keep += p.other_input_keys_to_keep
+        return list(set(list(other_input_keys_to_keep) + list(processor_keys_to_keep)))
+
+    def _get_other_output_keys_to_keep(self, other_output_keys_to_keep: Sequence[str]) -> list[str]:
+        """Get the other output keys to keep, which includes the ones that are needed for the processors."""
+        processor_keys_to_keep = []
+        for a in self.annotators.values():
+            for p in a.processors:
+                processor_keys_to_keep += p.other_output_keys_to_keep
+        return list(set(list(other_output_keys_to_keep) + list(processor_keys_to_keep)))
+
     #######################

@@ -548,9 +575,6 @@ class SingleAnnotator:
     completion_key : str, optional
         Key of the output of `fn_completions` to use for parsing the completions into annotations.
-
-    is_json_load_raw_completions : bool, optional
-        Whether to try to load the raw completions into a json. If exceptionthen will return the raw completions as is.
     """

     def __init__(
@@ -569,7 +593,6 @@ def __init__(
         processors_to_kwargs: Optional[dict[str, dict]] = None,
         is_add_default_processors: bool = True,
         completion_key: str = "completions",
-        is_json_load_raw_completions: bool = False,
     ):
         self.base_dir = Path(base_dir)
         self.prompt_template = self._get_prompt_template(prompt_template)
@@ -588,7 +611,6 @@ def __init__(
         self.batch_size = batch_size
         self.annotation_column = annotation_column
         self.completion_column = "raw_completion" if is_store_raw_completions else None
-        self.is_json_load_raw_completions = is_json_load_raw_completions
         self.is_add_default_processors = is_add_default_processors
         self.processors = []
@@ -605,6 +627,8 @@ def __init__(
         }
         for processor, processor_kwargs in processors_to_kwargs.items():
             processor_kwargs["seed"] = self.seed
+            processor_kwargs["annotation_column"] = self.annotation_column
+            processor_kwargs["completion_column"] = self.completion_column
             Processor = self._search_processor(processor)
             self.processors += [Processor(**processor_kwargs)]
@@ -744,12 +768,6 @@ def _parse_completions(self, completions: list[str]) -> tuple[list[Any], list[An

             all_annotations += batch_annotations

-            if self.is_json_load_raw_completions:
-                try:
-                    completion = json.loads(completion)
-                except:
-                    pass
-
             all_completions += [completion] * self.batch_size

         return all_annotations, all_completions
diff --git a/src/alpaca_eval/evaluators_configs/alpaca_eval_clf_cot_gpt4_turbo/configs.yaml b/src/alpaca_eval/evaluators_configs/alpaca_eval_clf_cot_gpt4_turbo/configs.yaml
index cdc963d0..a93d0e60 100644
--- a/src/alpaca_eval/evaluators_configs/alpaca_eval_clf_cot_gpt4_turbo/configs.yaml
+++ b/src/alpaca_eval/evaluators_configs/alpaca_eval_clf_cot_gpt4_turbo/configs.yaml
@@ -15,3 +15,5 @@ alpaca_eval_clf_cot_gpt4_turbo:
       log_prob_index: -1
     completion_key: "completions_all"
     batch_size: 1
+    processors_to_kwargs:
+      ChainOfThoughtProcessor: {}
diff --git a/src/alpaca_eval/evaluators_configs/alpaca_eval_cot_gpt4_turbo_fn/configs.yaml b/src/alpaca_eval/evaluators_configs/alpaca_eval_cot_gpt4_turbo_fn/configs.yaml
index d0e789b8..f6979d14 100644
--- a/src/alpaca_eval/evaluators_configs/alpaca_eval_cot_gpt4_turbo_fn/configs.yaml
+++ b/src/alpaca_eval/evaluators_configs/alpaca_eval_cot_gpt4_turbo_fn/configs.yaml
@@ -37,4 +37,5 @@ alpaca_eval_cot_gpt4_turbo_fn:
       ranking_parser:
         model_1_name: "m"
     batch_size: 1
-    is_json_load_raw_completions: true
+    processors_to_kwargs:
+      ChainOfThoughtProcessor: {}
diff --git a/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml b/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml
index 2d588466..1aa2b91d 100644
--- a/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml
+++ b/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml
@@ -33,4 +33,3 @@ alpaca_eval_gpt4_fn:
       annotation_key: "ordered_models"
       ranking_parser: {}
     batch_size: 1
-    is_json_load_raw_completions: true
diff --git a/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_turbo_fn/configs.yaml b/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_turbo_fn/configs.yaml
index 45a51a8b..b7f6be3a 100644
--- a/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_turbo_fn/configs.yaml
+++ b/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_turbo_fn/configs.yaml
@@ -34,4 +34,3 @@ alpaca_eval_gpt4_turbo_fn:
       ranking_parser:
         model_1_name: "m"
     batch_size: 1
-    is_json_load_raw_completions: true
diff --git a/src/alpaca_eval/evaluators_configs/chatgpt_fn/configs.yaml b/src/alpaca_eval/evaluators_configs/chatgpt_fn/configs.yaml
index 49efb680..b857a327 100644
--- a/src/alpaca_eval/evaluators_configs/chatgpt_fn/configs.yaml
+++ b/src/alpaca_eval/evaluators_configs/chatgpt_fn/configs.yaml
@@ -22,4 +22,3 @@ chatgpt_fn:
         1: '(?i)output \(a\)'
         2: '(?i)output \(b\)'
     batch_size: 1
-    is_json_load_raw_completions: true
diff --git a/src/alpaca_eval/processors.py b/src/alpaca_eval/processors.py
index 3244b719..a0d92a68 100644
--- a/src/alpaca_eval/processors.py
+++ b/src/alpaca_eval/processors.py
@@ -6,6 +6,7 @@
 """

 import abc
+import json
 from typing import Optional, Sequence

 import numpy as np
@@ -13,14 +14,25 @@

 from . import utils

-__all__ = ["RandomSwitchTwoColumnsProcessor", "PaddingForBatchesProcessor"]
+__all__ = ["RandomSwitchTwoColumnsProcessor", "PaddingForBatchesProcessor", "ChainOfThoughtProcessor"]


 class BaseProcessor(abc.ABC):
     """Base class for a processor."""

-    def __init__(self, seed: int = 123):
+    # additional input and output keys that should be kept in the annotator
+    other_input_keys_to_keep = []
+    other_output_keys_to_keep = []
+
+    def __init__(
+        self,
+        seed: int = 123,
+        annotation_column: str = "annotation",
+        completion_column: str = "raw_completion",
+    ):
         self.seed = seed
+        self.annotation_column = annotation_column
+        self.completion_column = completion_column

     @abc.abstractmethod
     def preprocess(self, df_to_annotate: pd.DataFrame) -> pd.DataFrame:
@@ -190,3 +202,78 @@ def preprocess(self, df_to_annotate: pd.DataFrame) -> pd.DataFrame:

     def postprocess(self, df_annotated: pd.DataFrame) -> pd.DataFrame:
         return df_annotated[~df_annotated["is_padding"].astype(bool)].drop(columns=["is_padding"]).copy()
+
+
+class ChainOfThoughtProcessor(BaseProcessor):
+    r"""Processes the raw completions by extracting the chain of thought as a new column
+    by loading them as a JSON and, if chain of thought is used, adding a dictionary
+    "referenced_models" to better understand which model names correspond to which outputs in the chain of thought.
+
+    Examples
+    --------
+    >>> raw_completion = '{"concise_explanation": "M is better", "ordered_models": [{"rank": 1, "model": "M"}, {"rank": 2, "model": "m"}]}'
+    >>> df = pd.DataFrame([dict(preference=2, raw_completion=raw_completion),
+    ...                    dict(preference=1, raw_completion=raw_completion)])
+    >>> processor = ChainOfThoughtProcessor()
+    >>> processor.postprocess(df)[["referenced_models", "concise_explanation"]]
+                        referenced_models concise_explanation
+    0  {'M': 'output_2', 'm': 'output_1'}          M is better
+    1  {'M': 'output_1', 'm': 'output_2'}          M is better
+    """
+
+    # those columns should be added to the final result
+    other_output_keys_to_keep = ["referenced_models", "concise_explanation"]
+
+    def preprocess(self, df_to_annotate: pd.DataFrame) -> pd.DataFrame:
+        return df_to_annotate
+
+    @property
+    def _tmp_col(self):
+        return "json_" + self.completion_column
+
+    def postprocess(self, df_annotated: pd.DataFrame) -> pd.DataFrame:
+        """Load the raw completion as a JSON and add the referenced models to better understand chain of thought."""
+        df_annotated = df_annotated.copy()
+
+        if self.completion_column in df_annotated:
+            df_annotated[self._tmp_col] = df_annotated[self.completion_column].apply(_try_json_load)
+            self.add_referenced_model_(df_annotated)
+            # add the concise explanation
+            df_annotated["concise_explanation"] = df_annotated[self._tmp_col].apply(
+                lambda x: x.get("concise_explanation", None)
+            )
+            df_annotated = df_annotated.drop(columns=[self._tmp_col])
+
+        return df_annotated
+
+    def add_referenced_model_(self, df):
+        """Add a dictionary to better understand chain of thought in case it's useful"""
+        df["referenced_models"] = None
+
+        for i, r in df.iterrows():
+            if (
+                isinstance(r[self._tmp_col], dict)
+                and "concise_explanation" in r[self._tmp_col]
+                and "ordered_models" in r[self._tmp_col]
+            ):
+                preference = int(df.loc[i, "preference"])
+                ordered_models = df.loc[i, self._tmp_col]["ordered_models"]
+                for m in ordered_models:
+                    if m["rank"] == 1:
+                        first_model = m["model"]
+                    elif m["rank"] == 2:
+                        second_model = m["model"]
+                    else:
+                        assert False
+
+                df.at[i, "referenced_models"] = {
+                    first_model: f"output_{preference}",
+                    second_model: f"output_{3 - preference}",
+                }
+
+
+def _try_json_load(el):
+    """Try to load as json"""
+    try:
+        return json.loads(el)
+    except:
+        return el
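
As a quick illustration of the interpretation workflow that the README hunk above describes, here is a minimal sketch of how one might inspect an `annotations.json` produced with a chain-of-thought annotator such as `alpaca_eval_cot_gpt4_turbo_fn`. The file path is illustrative (not part of this patch), and the sketch assumes the `concise_explanation` and `referenced_models` columns added by `ChainOfThoughtProcessor` are present:

```python
import json

# Illustrative path; point this at any annotations.json produced with a
# chain-of-thought annotator (records are written with orient="records",
# so the file loads as a list of dicts).
with open("results/gpt4/alpaca_eval_cot_gpt4_turbo_fn/annotations.json") as f:
    annotations = json.load(f)

for row in annotations[:3]:
    preference = row.get("preference")  # 1.0 -> output_1 preferred, 2.0 -> output_2
    explanation = row.get("concise_explanation")  # chain-of-thought summary from the annotator
    referenced = row.get("referenced_models") or {}  # e.g. {"M": "output_1", "m": "output_2"}
    print(f"preferred: output_{int(preference)}")
    for temporary_name, output_key in referenced.items():
        print(f"  {temporary_name!r} in the explanation refers to {output_key}")
    print(f"  explanation: {explanation}")
```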
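
The mapping that `add_referenced_model_` builds is worth spelling out: the rank-1 model in `ordered_models` is the one the annotator judged better, so it is mapped to `output_{preference}`, and the rank-2 model to `output_{3 - preference}`. Below is a small standalone restatement of that logic (a sketch mirroring the patch, not an additional API):

```python
def referenced_models(preference: int, ordered_models: list[dict]) -> dict[str, str]:
    """Map the temporary model names ("m"/"M") to output_1/output_2."""
    first = next(m["model"] for m in ordered_models if m["rank"] == 1)   # preferred model
    second = next(m["model"] for m in ordered_models if m["rank"] == 2)  # the other model
    return {first: f"output_{preference}", second: f"output_{3 - preference}"}


# Matches the ChainOfThoughtProcessor doctest above: preference=2 with M ranked first
# gives {'M': 'output_2', 'm': 'output_1'}.
print(referenced_models(2, [{"rank": 1, "model": "M"}, {"rank": 2, "model": "m"}]))
```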