[ENH] add referenced_models locally (#224)
* [ENH] add referenced_models locally

* [TEST] pass all tests

* [BUG] save referenced_models

* [BUG] allow the use of is_reapply_parsing with json raw_completion
YannDubs authored Feb 1, 2024
1 parent 3abbec9 commit 1fec906
Showing 9 changed files with 129 additions and 65 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -78,6 +78,7 @@ Details in [limitations](#limitations).
- [Data Release](#data-release)
- [Differences with AlpacaFarm](#differences-with-alpacafarm)
- [Related work](#related-work)
- [Interpreting Chain of Thought](#interpreting-chain-of-thought-when-available)
- [Major updates](#major-updates)

</details>
@@ -1253,9 +1254,9 @@ For example:
<details>
<summary><h2>Interpreting Chain of Thought, when available</h2></summary>

For some annotators, e.g. `alpaca_eval_cot_gpt4_turbo_fn`, we use chain-of-thought reasoning to make the model's preferences more interpretable. These can then be found under `raw_annotations` in the `annotations.json` file. We describe how to interpret those below.
For some annotators, e.g. `alpaca_eval_cot_gpt4_turbo_fn`, we use chain-of-thought reasoning to make the model's preferences more interpretable. These can then be found under `concise_explanation` in the `annotations.json` file. To interpret them, you should also look at `referenced_models`, which translates the temporary model names (in the prompt) to the actual outputs. Below, we explain what happens behind the scenes.

To better understand the auto-annotations, you can check the `raw_annotations["concise_explanation"]` column in `annotations.json` (e.g. [here](https://github.com/tatsu-lab/alpaca_eval/tree/main/results/gpt4/alpaca_eval_cot_gpt4_turbo_fn/annotations.json)), which contains the chain-of-thought reasoning of the auto-annotator. Note that `raw_annotations` is not modified by the randomization of the order of the outputs. In particular, `"m"` and `"M"` can sometimes refer to the first model (the reference) and sometimes to the second model (the model being evaluated). To understand which model is being referred to, you should use the columns `preference` and `ordered_models`. When pushing the annotations to GitHub, we use those to add a dictionary `"referenced_models"` mapping the model names to the corresponding outputs (see [`add_referenced_model_`](https://github.com/tatsu-lab/alpaca_eval/blob/main/docs/format_sample_sheets.py)).
You can check the `raw_annotations["concise_explanation"]` column in `annotations.json` (e.g. [here](https://github.com/tatsu-lab/alpaca_eval/tree/main/results/gpt4/alpaca_eval_cot_gpt4_turbo_fn/annotations.json)), which contains the chain-of-thought reasoning of the auto-annotator. Note that `raw_annotations` is not modified by the randomization of the order of the outputs. In particular, `"m"` and `"M"` can sometimes refer to the first model (the reference) and sometimes to the second model (the model being evaluated). To understand which model is being referred to, you should use the columns `preference` and `ordered_models`. To make this easier, we add a column `"referenced_models"` mapping the model names to the corresponding outputs. For example, in the following annotation (abridged, with long fields elided) we see that the preference is 1.0 (i.e. `output_1`) and corresponds to model `M` in `concise_explanation` (see `ordered_models`).

```json
{
  "instruction": "...",
  "output_1": "...",
  "output_2": "...",
  "preference": 1.0,
  "raw_completion": {
    "concise_explanation": "M is better",
    "ordered_models": [{"model": "M", "rank": 1}, {"model": "m", "rank": 2}]
  },
  "referenced_models": {"M": "output_1", "m": "output_2"}
}
```
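
To read these fields programmatically, here is a minimal sketch (assuming `annotations.json` follows the schema above; adjust the path to your results directory):

```python
import json

# Print each chain-of-thought explanation together with the mapping that
# translates the temporary model names ("m"/"M") back to output_1/output_2.
with open("results/gpt4/alpaca_eval_cot_gpt4_turbo_fn/annotations.json") as f:
    annotations = json.load(f)

for ann in annotations:
    raw = ann.get("raw_completion")
    refs = ann.get("referenced_models")
    if isinstance(raw, dict) and refs:
        print(refs, "->", raw["concise_explanation"])
```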
42 changes: 0 additions & 42 deletions docs/format_sample_sheets.py
@@ -1,4 +1,3 @@
import json
from pathlib import Path

import pandas as pd
@@ -9,42 +8,6 @@
RESULTS_DIR = CURRENT_DIR / "results"


def json_load(el):
    """Try to load as json"""
    try:
        return json.loads(el)
    except:
        return el


def add_referenced_model_(df):
    """Add a dictionary to better understand chain of thought in case it's useful"""

    for i, r in df.iterrows():
        if (
            isinstance(r["raw_completion"], dict)
            and "concise_explanation" in r["raw_completion"]
            and "ordered_models" in r["raw_completion"]
        ):
            preference = int(df.loc[i, "preference"])
            ordered_models = df.loc[i, "raw_completion"]["ordered_models"]
            for m in ordered_models:
                if m["rank"] == 1:
                    first_model = m["model"]
                elif m["rank"] == 2:
                    second_model = m["model"]
                else:
                    assert False

            if "referenced_models" not in df.columns:
                df["referenced_models"] = None

            df.at[i, "referenced_models"] = {
                first_model: f"output_{preference}",
                second_model: f"output_{3 - preference}",
            }


df_reference = pd.read_json(RESULTS_DIR / "text_davinci_003" / F_OUTPUTS, orient="records")


@@ -71,9 +34,4 @@ def add_referenced_model_(df):
# df["order"] = df.apply(lambda row: order[(row["dataset"], row["instruction"])], axis=1)
# df = df.sort_values("order").drop("order", axis=1)

# jsonify & add the referenced models
if "raw_completion" in df:
df["raw_completion"] = df["raw_completion"].apply(json_load)
add_referenced_model_(df)

df.to_json(f, orient="records", indent=2)
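
For reference, a toy run of the helper defined above (removed here; it reappears below as `ChainOfThoughtProcessor.add_referenced_model_`), on made-up data:

```python
import pandas as pd

# Made-up annotation: the rank-1 model "M" is the preferred one (preference=1),
# so it maps to output_1 and "m" maps to output_2.
df = pd.DataFrame(
    [
        {
            "preference": 1,
            "raw_completion": {
                "concise_explanation": "M is better",
                "ordered_models": [{"rank": 1, "model": "M"}, {"rank": 2, "model": "m"}],
            },
        }
    ]
)
add_referenced_model_(df)
print(df.loc[0, "referenced_models"])  # {'M': 'output_1', 'm': 'output_2'}
```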
48 changes: 33 additions & 15 deletions src/alpaca_eval/annotators/base.py
@@ -88,7 +88,11 @@ def __init__(
        annotators_config: Union[utils.AnyPath, list[dict[str, Any]]] = "claude",
        seed: Optional[int] = 0,
        is_avoid_reannotations: bool = True,
        other_output_keys_to_keep: Sequence[str] = ("price_per_example", "time_per_example", "raw_completion"),
        other_output_keys_to_keep: Sequence[str] = (
            "price_per_example",
            "time_per_example",
            "raw_completion",
        ),
        other_input_keys_to_keep: Sequence[str] = (),
        is_store_missing_annotations: bool = True,
        base_dir: Optional[utils.AnyPath] = None,
@@ -102,9 +106,6 @@ def __init__(
        self.is_avoid_reannotations = is_avoid_reannotations
        self.primary_keys = list(primary_keys)
        self.all_keys = self.primary_keys + [self.annotator_column]
        self.other_output_keys_to_keep = list(other_output_keys_to_keep)
        self.other_input_keys_to_keep = list(other_input_keys_to_keep)
        self.other_keys_to_keep = self.other_output_keys_to_keep + self.other_input_keys_to_keep
        self.is_store_missing_annotations = is_store_missing_annotations
        self.is_raise_if_missing_primary_keys = is_raise_if_missing_primary_keys
        if isinstance(annotation_type, str):
@@ -116,6 +117,10 @@ def __init__(
        self.annotators = self._initialize_annotators()
        self.df_annotations = None

        self.other_input_keys_to_keep = self._get_other_input_keys_to_keep(other_input_keys_to_keep)
        self.other_output_keys_to_keep = self._get_other_output_keys_to_keep(other_output_keys_to_keep)
        self.other_keys_to_keep = self.other_output_keys_to_keep + self.other_input_keys_to_keep

    ### Abstract methods ###

    #######################
@@ -280,6 +285,12 @@ def _annotate(self, df_to_annotate: pd.DataFrame, **decoding_kwargs) -> pd.DataF
            columns_to_annotate = columns_to_annotate + [
                c for c in self.other_output_keys_to_keep if c in df_to_annotate.columns
            ]
            # if "raw_completion" in df_to_annotate is a dict, dump it back to a JSON string so that it can be reparsed
            # TODO: this is for backward compatibility, remove in the future
            if "raw_completion" in df_to_annotate.columns:
                df_to_annotate["raw_completion"] = df_to_annotate["raw_completion"].apply(
                    lambda x: json.dumps(x) if isinstance(x, dict) else x
                )

            curr_annotated = self.annotators[annotator](
                df_to_annotate.loc[curr_idcs, columns_to_annotate],
@@ -421,6 +432,22 @@ def _merge_annotations(

        return df_to_annotate

    def _get_other_input_keys_to_keep(self, other_input_keys_to_keep: Sequence[str]) -> list[str]:
        """Get the other input keys to keep, which includes the ones that are needed for the processors."""
        processor_keys_to_keep = []
        for a in self.annotators.values():
            for p in a.processors:
                processor_keys_to_keep += p.other_input_keys_to_keep
        return list(set(list(other_input_keys_to_keep) + list(processor_keys_to_keep)))

    def _get_other_output_keys_to_keep(self, other_output_keys_to_keep: Sequence[str]) -> list[str]:
        """Get the other output keys to keep, which includes the ones that are needed for the processors."""
        processor_keys_to_keep = []
        for a in self.annotators.values():
            for p in a.processors:
                processor_keys_to_keep += p.other_output_keys_to_keep
        return list(set(list(other_output_keys_to_keep) + list(processor_keys_to_keep)))

    #######################


@@ -548,9 +575,6 @@ class SingleAnnotator:
    completion_key : str, optional
        Key of the output of `fn_completions` to use for parsing the completions into annotations.

    is_json_load_raw_completions : bool, optional
        Whether to try to load the raw completions as JSON. If an exception is raised, the raw completions are returned as is.
    """

    def __init__(
@@ -569,7 +593,6 @@ def __init__(
        processors_to_kwargs: Optional[dict[str, dict]] = None,
        is_add_default_processors: bool = True,
        completion_key: str = "completions",
        is_json_load_raw_completions: bool = False,
    ):
        self.base_dir = Path(base_dir)
        self.prompt_template = self._get_prompt_template(prompt_template)
@@ -588,7 +611,6 @@ def __init__(
        self.batch_size = batch_size
        self.annotation_column = annotation_column
        self.completion_column = "raw_completion" if is_store_raw_completions else None
        self.is_json_load_raw_completions = is_json_load_raw_completions

        self.is_add_default_processors = is_add_default_processors
        self.processors = []
@@ -605,6 +627,8 @@ def __init__(
        }
        for processor, processor_kwargs in processors_to_kwargs.items():
            processor_kwargs["seed"] = self.seed
            processor_kwargs["annotation_column"] = self.annotation_column
            processor_kwargs["completion_column"] = self.completion_column
            Processor = self._search_processor(processor)
            self.processors += [Processor(**processor_kwargs)]

@@ -744,12 +768,6 @@ def _parse_completions(self, completions: list[str]) -> tuple[list[Any], list[An

            all_annotations += batch_annotations

            if self.is_json_load_raw_completions:
                try:
                    completion = json.loads(completion)
                except:
                    pass

            all_completions += [completion] * self.batch_size
        return all_annotations, all_completions

2 changes: 2 additions & 0 deletions src/alpaca_eval/evaluators_configs/alpaca_eval_clf_cot_gpt4_turbo/configs.yaml
@@ -15,3 +15,5 @@ alpaca_eval_clf_cot_gpt4_turbo:
    log_prob_index: -1
  completion_key: "completions_all"
  batch_size: 1
  processors_to_kwargs:
    ChainOfThoughtProcessor: {}
3 changes: 2 additions & 1 deletion src/alpaca_eval/evaluators_configs/alpaca_eval_cot_gpt4_turbo_fn/configs.yaml
@@ -37,4 +37,5 @@ alpaca_eval_cot_gpt4_turbo_fn:
      ranking_parser:
        model_1_name: "m"
  batch_size: 1
  is_json_load_raw_completions: true
  processors_to_kwargs:
    ChainOfThoughtProcessor: {}
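
With this config, the `SingleAnnotator.__init__` shown above resolves each key of `processors_to_kwargs` to a class in `processors.py` and injects the seed and column names. A minimal standalone sketch of that wiring (illustrative values; `getattr` stands in for `self._search_processor`):

```python
from alpaca_eval import processors

# Simplified sketch of the loop in SingleAnnotator.__init__ (see base.py above):
# each key of processors_to_kwargs names a processor class, instantiated with
# the injected seed, annotation_column, and completion_column.
processors_to_kwargs = {"ChainOfThoughtProcessor": {}}
built = []
for name, kwargs in processors_to_kwargs.items():
    kwargs["seed"] = 123  # injected by the annotator
    kwargs["annotation_column"] = "preference"
    kwargs["completion_column"] = "raw_completion"
    Processor = getattr(processors, name)  # stand-in for self._search_processor(processor)
    built.append(Processor(**kwargs))
```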
1 change: 0 additions & 1 deletion src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml
@@ -33,4 +33,3 @@ alpaca_eval_gpt4_fn:
        annotation_key: "ordered_models"
      ranking_parser: {}
  batch_size: 1
  is_json_load_raw_completions: true
1 change: 0 additions & 1 deletion src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_turbo_fn/configs.yaml
@@ -34,4 +34,3 @@ alpaca_eval_gpt4_turbo_fn:
      ranking_parser:
        model_1_name: "m"
  batch_size: 1
  is_json_load_raw_completions: true
1 change: 0 additions & 1 deletion src/alpaca_eval/evaluators_configs/chatgpt_fn/configs.yaml
@@ -22,4 +22,3 @@ chatgpt_fn:
      1: '(?i)output \(a\)'
      2: '(?i)output \(b\)'
  batch_size: 1
  is_json_load_raw_completions: true
91 changes: 89 additions & 2 deletions src/alpaca_eval/processors.py
@@ -6,21 +6,33 @@
"""

import abc
import json
from typing import Optional, Sequence

import numpy as np
import pandas as pd

from . import utils

__all__ = ["RandomSwitchTwoColumnsProcessor", "PaddingForBatchesProcessor"]
__all__ = ["RandomSwitchTwoColumnsProcessor", "PaddingForBatchesProcessor", "ChainOfThoughtProcessor"]


class BaseProcessor(abc.ABC):
    """Base class for a processor."""

    def __init__(self, seed: int = 123):
    # additional input and output keys that should be kept in the annotator
    other_input_keys_to_keep = []
    other_output_keys_to_keep = []

    def __init__(
        self,
        seed: int = 123,
        annotation_column: str = "annotation",
        completion_column: str = "raw_completion",
    ):
        self.seed = seed
        self.annotation_column = annotation_column
        self.completion_column = completion_column

    @abc.abstractmethod
    def preprocess(self, df_to_annotate: pd.DataFrame) -> pd.DataFrame:
@@ -190,3 +202,78 @@ def preprocess(self, df_to_annotate: pd.DataFrame) -> pd.DataFrame:

    def postprocess(self, df_annotated: pd.DataFrame) -> pd.DataFrame:
        return df_annotated[~df_annotated["is_padding"].astype(bool)].drop(columns=["is_padding"]).copy()


class ChainOfThoughtProcessor(BaseProcessor):
    r"""Processes the raw completions by loading them as JSON and extracting the chain of thought as a new
    column. If chain of thought is used, also adds a dictionary "referenced_models" that clarifies which
    model names in the chain of thought correspond to which outputs.

    Examples
    --------
    >>> raw_completion = '{"concise_explanation": "M is better", "ordered_models": [{"rank": 1, "model": "M"}, {"rank": 2, "model": "m"}]}'
    >>> df = pd.DataFrame([dict(preference=2, raw_completion=raw_completion),
    ...                    dict(preference=1, raw_completion=raw_completion)])
    >>> processor = ChainOfThoughtProcessor()
    >>> processor.postprocess(df)[["referenced_models", "concise_explanation"]]
                        referenced_models concise_explanation
    0  {'M': 'output_2', 'm': 'output_1'}         M is better
    1  {'M': 'output_1', 'm': 'output_2'}         M is better
    """

    # those columns should be added to the final result
    other_output_keys_to_keep = ["referenced_models", "concise_explanation"]

    def preprocess(self, df_to_annotate: pd.DataFrame) -> pd.DataFrame:
        return df_to_annotate

    @property
    def _tmp_col(self):
        return "json_" + self.completion_column

    def postprocess(self, df_annotated: pd.DataFrame) -> pd.DataFrame:
        """Load the raw completion as a JSON and add the referenced models to better understand chain of thought."""
        df_annotated = df_annotated.copy()

        if self.completion_column in df_annotated:
            df_annotated[self._tmp_col] = df_annotated[self.completion_column].apply(_try_json_load)
            self.add_referenced_model_(df_annotated)
            # add the concise explanation (None if the completion could not be loaded as a dict)
            df_annotated["concise_explanation"] = df_annotated[self._tmp_col].apply(
                lambda x: x.get("concise_explanation", None) if isinstance(x, dict) else None
            )
            df_annotated = df_annotated.drop(columns=[self._tmp_col])

        return df_annotated

    def add_referenced_model_(self, df):
        """Add a dictionary to better understand chain of thought in case it's useful"""
        df["referenced_models"] = None

        for i, r in df.iterrows():
            if (
                isinstance(r[self._tmp_col], dict)
                and "concise_explanation" in r[self._tmp_col]
                and "ordered_models" in r[self._tmp_col]
            ):
                preference = int(df.loc[i, "preference"])
                ordered_models = df.loc[i, self._tmp_col]["ordered_models"]
                for m in ordered_models:
                    if m["rank"] == 1:
                        first_model = m["model"]
                    elif m["rank"] == 2:
                        second_model = m["model"]
                    else:
                        assert False

                df.at[i, "referenced_models"] = {
                    first_model: f"output_{preference}",
                    second_model: f"output_{3 - preference}",
                }


def _try_json_load(el):
    """Try to load as json"""
    try:
        return json.loads(el)
    except:
        return el
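
Finally, note how the processor's new class attributes feed back into the annotator: `_get_other_output_keys_to_keep` in `base.py` above unions them with the annotator's own keys. A minimal standalone sketch:

```python
# Standalone sketch of the union performed by _get_other_output_keys_to_keep
# (base.py above): the processor's class attribute contributes the two new
# columns so that they survive into the final annotations.
default_keys = ["price_per_example", "time_per_example", "raw_completion"]
processor_keys = ChainOfThoughtProcessor.other_output_keys_to_keep
# processor_keys == ["referenced_models", "concise_explanation"]
merged = list(set(default_keys + processor_keys))
```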
