Commit

Fix price for gpt-4o-mini-2024-07-18
tongyx361 committed Aug 24, 2024
1 parent bb9d03a commit 3bd2098
Showing 2 changed files with 76 additions and 23 deletions.
97 changes: 75 additions & 22 deletions src/alpaca_eval/decoders/openai.py
@@ -87,7 +87,9 @@ def openai_completions(
logging.info("No samples to annotate.")
return []
else:
logging.info(f"Using `openai_completions` on {n_examples} prompts using {model_name}.")
logging.info(
f"Using `openai_completions` on {n_examples} prompts using {model_name}."
)

if tokens_to_avoid or tokens_to_favor:
tokenizer = tiktoken.encoding_for_model(model_name)
@@ -97,7 +99,9 @@
for t in tokens_to_avoid:
curr_tokens = tokenizer.encode(t)
if len(curr_tokens) != 1 and is_skip_multi_tokens_to_avoid:
logging.warning(f"'{t}' has more than one token, skipping because `is_skip_multi_tokens_to_avoid`.")
logging.warning(
f"'{t}' has more than one token, skipping because `is_skip_multi_tokens_to_avoid`."
)
continue
for tok_id in curr_tokens:
logit_bias[tok_id] = -100 # avoids certain tokens
@@ -113,15 +117,19 @@
if is_strip:
prompts = [p.strip() for p in prompts]

requires_chatml = decoding_kwargs.pop("requires_chatml", _requires_chatml(model_name))
requires_chatml = decoding_kwargs.pop(
"requires_chatml", _requires_chatml(model_name)
)
decoding_kwargs["is_chat"] = decoding_kwargs.get("is_chat", requires_chatml)
if requires_chatml:
prompts = [utils.prompt_to_chatml(prompt) for prompt in prompts]
num_procs = num_procs or 2
batch_size = batch_size or 1

if batch_size > 1:
logging.warning("batch_size > 1 is not supported yet for chat models. Setting to 1")
logging.warning(
"batch_size > 1 is not supported yet for chat models. Setting to 1"
)
batch_size = 1

else:
@@ -130,7 +138,10 @@

n_batches = int(math.ceil(n_examples / batch_size))

prompt_batches = [prompts[batch_id * batch_size : (batch_id + 1) * batch_size] for batch_id in range(n_batches)]
prompt_batches = [
prompts[batch_id * batch_size : (batch_id + 1) * batch_size]
for batch_id in range(n_batches)
]

if isinstance(max_tokens, int):
max_tokens = [max_tokens] * n_examples
@@ -149,7 +160,9 @@
]
else:
with ThreadPoolExecutor(max_workers=num_procs) as p:
partial_completion_helper = functools.partial(_openai_completion_helper, **kwargs)
partial_completion_helper = functools.partial(
_openai_completion_helper, **kwargs
)
completions = list(
tqdm.tqdm(
p.map(partial_completion_helper, inputs),
@@ -160,7 +173,11 @@
logging.info(f"Completed {n_examples} examples in {t}.")

# flatten the list and select only the text
completions_all = [completion for completion_batch in completions for completion in completion_batch]
completions_all = [
completion
for completion_batch in completions
for completion in completion_batch
]
completions_text = [completion["text"] for completion in completions_all]

price = [
@@ -186,9 +203,15 @@ def _openai_completion_helper(
temperature: Optional[float] = 0.7,
client_config_path: utils.AnyPath = constants.OPENAI_CLIENT_CONFIG_PATH, # see `client_configs/README.md`
# following is only for backward compatibility and should be avoided
openai_organization_ids: Optional[Sequence[str]] = constants.OPENAI_ORGANIZATION_IDS,
openai_organization_ids: Optional[
Sequence[str]
] = constants.OPENAI_ORGANIZATION_IDS,
openai_api_keys: Optional[Sequence[str]] = constants.OPENAI_API_KEYS,
openai_api_base: Optional[str] = os.getenv("OPENAI_API_BASE") if os.getenv("OPENAI_API_BASE") else openai.base_url,
openai_api_base: Optional[str] = (
os.getenv("OPENAI_API_BASE")
if os.getenv("OPENAI_API_BASE")
else openai.base_url
),
############################
client_kwargs: Optional[dict[str, Any]] = None,
n_retries: Optional[int] = 10,
@@ -212,7 +235,9 @@
# randomly select the client
client_idcs = range(len(all_clients))
curr_client_idx = random.choice(client_idcs)
logging.info(f"Using OAI client number {curr_client_idx+1} out of {len(client_idcs)}.")
logging.info(
f"Using OAI client number {curr_client_idx+1} out of {len(client_idcs)}."
)
client = all_clients[curr_client_idx]

# copy shared_kwargs to avoid modifying it
@@ -224,7 +249,9 @@
for _ in range(n_retries):
try:
if is_chat:
completion_batch = client.chat.completions.create(messages=prompt_batch[0], **curr_kwargs)
completion_batch = client.chat.completions.create(
messages=prompt_batch[0], **curr_kwargs
)

choices = completion_batch.choices
for i, choice in enumerate(choices):
@@ -244,24 +271,34 @@

if choice.message.tool_calls is not None:
# currently we only use function calls to get a JSON object => return raw text of json
choices[i]["text"] = choice.message.tool_calls[0].function.arguments
choices[i]["text"] = choice.message.tool_calls[
0
].function.arguments

else:
completion_batch = client.completions.create(prompt=prompt_batch, **curr_kwargs)
completion_batch = client.completions.create(
prompt=prompt_batch, **curr_kwargs
)
choices = completion_batch.choices
for i, choice in enumerate(choices):
choices[i] = choice.model_dump()

for choice in choices:
choice["total_tokens"] = completion_batch.usage.total_tokens / len(prompt_batch)
choice["total_tokens"] = completion_batch.usage.total_tokens / len(
prompt_batch
)
break
except openai.OpenAIError as e:
logging.warning(f"OpenAIError: {e}.")
if "Please reduce" in str(e):
kwargs["max_tokens"] = int(kwargs["max_tokens"] * 0.8)
logging.warning(f"Reducing target length to {kwargs['max_tokens']}, Retrying...")
logging.warning(
f"Reducing target length to {kwargs['max_tokens']}, Retrying..."
)
if kwargs["max_tokens"] == 0:
logging.exception("Prompt is already longer than max context length. Error:")
logging.exception(
"Prompt is already longer than max context length. Error:"
)
raise e
elif "Please try again with a different prompt." in str(e):
logging.warning(
@@ -276,12 +313,20 @@
if "rate limit" in str(e).lower():
logging.warning(f"Hit request rate limit; retrying...")
else:
logging.warning(f"Unknown error. \n It's likely a rate limit so we are retrying...")
logging.warning(
f"Unknown error. \n It's likely a rate limit so we are retrying..."
)
if len(all_clients) > 1:
curr_client_idx = random.choice([idx for idx in client_idcs if idx != curr_client_idx])
curr_client_idx = random.choice(
[idx for idx in client_idcs if idx != curr_client_idx]
)
client = all_clients[curr_client_idx]
logging.info(f"Switching OAI client to client number {curr_client_idx}.")
logging.info(f"Sleeping {sleep_time} before retrying to call openai API...")
logging.info(
f"Switching OAI client to client number {curr_client_idx}."
)
logging.info(
f"Sleeping {sleep_time} before retrying to call openai API..."
)
time.sleep(sleep_time) # Annoying rate limit on requests.

if choices is None:
@@ -295,15 +340,23 @@
def _requires_chatml(model: str) -> bool:
"""Whether a model requires the ChatML format."""
# TODO: this should ideally be an OpenAI function... Maybe it already exists?
not_chatml = ("instruct" in model) or ("gpt-3" in model and "turbo" not in model) or (model.startswith("text-"))
not_chatml = (
("instruct" in model)
or ("gpt-3" in model and "turbo" not in model)
or (model.startswith("text-"))
)
return not not_chatml


def _get_price_per_token(model, price_per_token=None):
"""Returns the price per token for a given model"""
if price_per_token is not None:
return float(price_per_token)
if "gpt-4-turbo" in model:
if "gpt-4o-mini-2024-07-18" in model:
return (
0.15 / 1_000_000
) # that's not completely true because decoding is 0.03 but close enough given that most is context
elif "gpt-4-turbo" in model:
return 0.01 / 1000
elif "gpt-4-1106" in model:
return (
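The substantive change in this file is the new `gpt-4o-mini-2024-07-18` branch of `_get_price_per_token`; the other hunks are formatting-only. A rough sketch of how that per-token price turns into an estimated annotation cost — the helper below is a simplified stand-in for the function in the diff (the real one covers many more models), and the token counts are hypothetical:

```python
def _get_price_per_token(model: str, price_per_token=None) -> float:
    """Simplified stand-in for the `_get_price_per_token` shown in the diff above."""
    if price_per_token is not None:
        return float(price_per_token)
    if "gpt-4o-mini-2024-07-18" in model:
        # prompt-side price; completion tokens cost more, but most tokens are context
        return 0.15 / 1_000_000
    elif "gpt-4-turbo" in model:
        return 0.01 / 1000
    raise ValueError(f"No known price for model {model!r}")


# hypothetical `usage.total_tokens` values for three annotated examples
total_tokens = [850, 920, 790]
price = sum(_get_price_per_token("gpt-4o-mini-2024-07-18") * t for t in total_tokens)
print(f"~${price:.6f} for {sum(total_tokens)} tokens")  # ~$0.000384 for 2560 tokens
```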
@@ -11,7 +11,7 @@ gpt4_turbo_cot_logprob,67.86974910317902,5.397145061728395,1568.9484159171295,0.
gpt4_turbo_cot_clf,67.59689922480621,5.3972248062015495,1528.4046718706977,0.6666666666666667,0.6326057742256878,,,0.5936794582392777,0.5855855855855856,0.5255813953488373,645,verified
claude_ranking,67.5925925925926,4.954578395061729,218.4230414438272,0.9,0.90848221004591,,,0.7303370786516854,0.6576576576576577,0.4552469135802468,648,verified
alpaca_eval_llama3_70b_fn,67.53091913784353,0.41207197526091993,208.69685160402955,0.9,0.8577236113497642,32.25308641975309,8.204334365325078,0.7910112359550562,0.6576576576576577,0.47931967529957475,2587,minimal
weighted_alpaca_eval_gpt-4o-mini-2024-07-18,67.34955133561454,12.90736111111111,93.54821923706267,0.9833333333333333,0.9389828560875118,32.24432530355238,14.380747136032564,0.7094594594594594,0.6306306306306306,0.5017959384038282,2592,minimal
weighted_alpaca_eval_gpt-4o-mini-2024-07-18,0.33674775667807266,12.90736111111111,93.54821923706267,0.9833333333333333,0.9389828560875118,32.24432530355238,14.380747136032564,0.7094594594594594,0.6306306306306306,0.5017959384038282,2592,minimal
gpt4,66.93672839506173,12.452592592592593,1036.788589334915,0.8833333333333333,0.8668599990267735,31.481481481481488,14.621913580246911,0.647191011235955,0.6666666666666666,0.5397376543209877,2592,minimal
alpaca_farm_greedy_gpt4,66.43518518518519,15.28163425925926,877.6250469425926,0.8499999999999999,0.7481465609199582,30.246913580246915,19.290123456790123,0.597752808988764,0.6486486486486487,0.5362654320987654,2592,minimal
weighted_alpaca_eval_gpt4_turbo,65.73198824263118,4.323981481481481,227.7462866895061,0.7833333333333333,0.7688872243700914,33.89896126543981,23.652705035108028,0.6058558558558559,0.5727272727272728,0.5282783420419752,2592,minimal
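Most of the Python hunks above only re-wrap lines, but they all sit in the decoder's fan-out path: `openai_completions` slices the prompts into batches and maps `_openai_completion_helper` over them with a thread pool. A minimal standalone sketch of that pattern, using a dummy worker in place of the real OpenAI call (function and variable names here are illustrative, not the repository's):

```python
import functools
import math
from concurrent.futures import ThreadPoolExecutor


def _worker(prompt_batch, model_name="gpt-4o-mini-2024-07-18"):
    # stand-in for `_openai_completion_helper`: one dict per prompt in the batch
    return [{"text": f"reply to {p!r}", "model": model_name} for p in prompt_batch]


prompts = ["a", "b", "c", "d", "e"]
batch_size = 2
n_batches = math.ceil(len(prompts) / batch_size)
prompt_batches = [
    prompts[i * batch_size : (i + 1) * batch_size] for i in range(n_batches)
]

with ThreadPoolExecutor(max_workers=2) as pool:
    partial_worker = functools.partial(_worker, model_name="gpt-4o-mini-2024-07-18")
    completions = list(pool.map(partial_worker, prompt_batches))

# flatten the per-batch results, as in the diff
completions_all = [c for batch in completions for c in batch]
print(len(completions_all))  # 5
```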
