diff --git a/src/alpaca_eval/decoders/openai.py b/src/alpaca_eval/decoders/openai.py index a61900b4..df8a0f6c 100644 --- a/src/alpaca_eval/decoders/openai.py +++ b/src/alpaca_eval/decoders/openai.py @@ -303,7 +303,11 @@ def _get_price_per_token(model, price_per_token=None): """Returns the price per token for a given model""" if price_per_token is not None: return float(price_per_token) - if "gpt-4-turbo" in model: + if "gpt-4o-mini-2024-07-18" in model: + return ( + 0.15 / 1_000_000 + ) # that's not completely true because decoding is 0.6 but close enough given that most is context + elif "gpt-4-turbo" in model: return 0.01 / 1000 elif "gpt-4-1106" in model: return ( diff --git a/src/alpaca_eval/evaluators_configs/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/configs.yaml b/src/alpaca_eval/evaluators_configs/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/configs.yaml new file mode 100644 index 00000000..aa3551b7 --- /dev/null +++ b/src/alpaca_eval/evaluators_configs/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/configs.yaml @@ -0,0 +1,16 @@ +weighted_alpaca_eval_gpt-4o-mini-2024-07-18: + prompt_template: "alpaca_eval_clf_gpt4_turbo/alpaca_eval_clf.txt" + fn_completions: "openai_completions" + completions_kwargs: + model_name: "gpt-4o-mini-2024-07-18" + max_tokens: 1 + temperature: 1 # temperature is only applied for sampling, so it should have no effect here. 
+ logprobs: true + top_logprobs: 5 + fn_completion_parser: "logprob_parser" + completion_parser_kwargs: + numerator_token: "m" + denominator_tokens: ["m", "M"] + is_binarize: false + completion_key: "completions_all" + batch_size: 1 diff --git a/src/alpaca_eval/leaderboards/evaluators/evaluators_leaderboard.csv b/src/alpaca_eval/leaderboards/evaluators/evaluators_leaderboard.csv index e4d139d8..859818ed 100644 --- a/src/alpaca_eval/leaderboards/evaluators/evaluators_leaderboard.csv +++ b/src/alpaca_eval/leaderboards/evaluators/evaluators_leaderboard.csv @@ -11,6 +11,7 @@ gpt4_turbo_cot_logprob,67.86974910317902,5.397145061728395,1568.9484159171295,0. gpt4_turbo_cot_clf,67.59689922480621,5.3972248062015495,1528.4046718706977,0.6666666666666667,0.6326057742256878,,,0.5936794582392777,0.5855855855855856,0.5255813953488373,645,verified claude_ranking,67.5925925925926,4.954578395061729,218.4230414438272,0.9,0.90848221004591,,,0.7303370786516854,0.6576576576576577,0.4552469135802468,648,verified alpaca_eval_llama3_70b_fn,67.53091913784353,0.41207197526091993,208.69685160402955,0.9,0.8577236113497642,32.25308641975309,8.204334365325078,0.7910112359550562,0.6576576576576577,0.47931967529957475,2587,minimal +weighted_alpaca_eval_gpt-4o-mini-2024-07-18,0.33674775667807266,12.90736111111111,93.54821923706267,0.9833333333333333,0.9389828560875118,32.24432530355238,14.380747136032564,0.7094594594594594,0.6306306306306306,0.5017959384038282,2592,minimal gpt4,66.93672839506173,12.452592592592593,1036.788589334915,0.8833333333333333,0.8668599990267735,31.481481481481488,14.621913580246911,0.647191011235955,0.6666666666666666,0.5397376543209877,2592,minimal alpaca_farm_greedy_gpt4,66.43518518518519,15.28163425925926,877.6250469425926,0.8499999999999999,0.7481465609199582,30.246913580246915,19.290123456790123,0.597752808988764,0.6486486486486487,0.5362654320987654,2592,minimal 
weighted_alpaca_eval_gpt4_turbo,65.73198824263118,4.323981481481481,227.7462866895061,0.7833333333333333,0.7688872243700914,33.89896126543981,23.652705035108028,0.6058558558558559,0.5727272727272728,0.5282783420419752,2592,minimal