From c006178d872acc2a95a97e8a7733f85c1c1c4d77 Mon Sep 17 00:00:00 2001 From: Zhuang Li Date: Fri, 17 May 2024 14:15:26 +1000 Subject: [PATCH] llama3 evaluator (#314) --- .../alpaca_eval_fn.txt | 31 +++++++++++++++++++ .../alpaca_eval_llama3_70b_fn/configs.yaml | 13 ++++++++ .../alpaca_eval_llama3_70b_fn_leaderboard.csv | 31 +++++++++++++++++++ 3 files changed, 75 insertions(+) create mode 100644 src/alpaca_eval/evaluators_configs/alpaca_eval_llama3_70b_fn/alpaca_eval_fn.txt create mode 100644 src/alpaca_eval/evaluators_configs/alpaca_eval_llama3_70b_fn/configs.yaml create mode 100644 src/alpaca_eval/leaderboards/data_AlpacaEval/alpaca_eval_llama3_70b_fn_leaderboard.csv diff --git a/src/alpaca_eval/evaluators_configs/alpaca_eval_llama3_70b_fn/alpaca_eval_fn.txt b/src/alpaca_eval/evaluators_configs/alpaca_eval_llama3_70b_fn/alpaca_eval_fn.txt new file mode 100644 index 00000000..00ae4a9e --- /dev/null +++ b/src/alpaca_eval/evaluators_configs/alpaca_eval_llama3_70b_fn/alpaca_eval_fn.txt @@ -0,0 +1,31 @@ +<|im_start|>system +You are a helpful assistant, that ranks models by the quality of their answers. +<|im_end|> +<|im_start|>user +I want you to create a leaderboard of different of large-language models. To do so, I will give you the instructions (prompts) given to the models, and the responses of two models. Please rank the models based on which responses would be preferred by humans. All inputs and outputs should be python dictionaries. + +Here is the prompt: +{ + "instruction": """{instruction}""", +} + +Here are the outputs of the models: +[ + { + "model": "model_1", + "answer": """{output_1}""" + }, + { + "model": "model_2", + "answer": """{output_2}""" + } +] + +Now please rank the models by the quality of their answers, so that the model with rank 1 has the best output. Then return a list of the model names and ranks, i.e., produce the following output: +[ + {'model': , 'rank': }, + {'model': , 'rank': } +] + +Your response must be a valid Python dictionary and should contain nothing else because we will directly execute it in Python. Please provide the ranking that the majority of humans would give. +<|im_end|> \ No newline at end of file diff --git a/src/alpaca_eval/evaluators_configs/alpaca_eval_llama3_70b_fn/configs.yaml b/src/alpaca_eval/evaluators_configs/alpaca_eval_llama3_70b_fn/configs.yaml new file mode 100644 index 00000000..c51dcc5c --- /dev/null +++ b/src/alpaca_eval/evaluators_configs/alpaca_eval_llama3_70b_fn/configs.yaml @@ -0,0 +1,13 @@ +alpaca_eval_llama3_70b_fn: + prompt_template: "alpaca_eval_llama3_70b_fn/alpaca_eval_fn.txt" + fn_completions: "openai_completions" + completions_kwargs: + requires_chatml: True + model_name: "meta-llama/Llama-3-70b-chat-hf" + max_tokens: 100 + temperature: 0 + price_per_token: 9e-7 + client_kwargs: + base_url: 'https://api.together.xyz/v1' + fn_completion_parser: "ranking_parser" + batch_size: 1 diff --git a/src/alpaca_eval/leaderboards/data_AlpacaEval/alpaca_eval_llama3_70b_fn_leaderboard.csv b/src/alpaca_eval/leaderboards/data_AlpacaEval/alpaca_eval_llama3_70b_fn_leaderboard.csv new file mode 100644 index 00000000..e4d139d8 --- /dev/null +++ b/src/alpaca_eval/leaderboards/data_AlpacaEval/alpaca_eval_llama3_70b_fn_leaderboard.csv @@ -0,0 +1,31 @@ +,Human agreement,Price [$/1000 examples],Time [seconds/1000 examples],Spearman corr.,Pearson corr.,Bias,Variance,Proba. prefer longer,Proba. prefer lists,Proba. prefer 1,# parsed,mode +alpaca_eval_gpt4_fn,70.98765432098766,14.471944444444444,5046.056233900002,0.95,0.9447778041206524,27.623456790123456,11.11111111111111,0.750561797752809,0.6756756756756757,0.4799382716049383,2592,verified +improved_aviary_gpt4,69.75308641975309,12.781435185185186,1831.2850013,0.8833333333333333,0.8993690915590962,,,0.7280898876404495,0.7027027027027027,0.4861111111111111,648,verified +alpaca_eval_gpt4,69.1743827160494,13.601944444444444,1455.4169713998845,0.9666666666666668,0.9335485321531084,28.395061728395056,14.621913580246911,0.6831460674157304,0.7297297297297297,0.5011574074074074,2592,minimal +alpaca_eval_clf_cot_gpt4_turbo,68.70109546165884,6.441079812206572,1753.4788411931145,0.9333333333333332,0.7570054666164165,,,0.6863636363636364,0.6545454545454545,0.5352112676056338,639,verified +alpaca_eval_cot_gpt4_turbo_fn,68.63874533448178,6.311349574632637,1988.6012626717545,0.9707197941566388,0.8997919147215918,29.320987654320984,18.435272517819858,0.6696629213483146,0.6126126126126126,0.5232018561484919,2586,minimal +weighted_alpaca_eval_cot_gpt4_turbo,68.45771313115921,6.447465224111284,1869.2926495435856,0.9333333333333332,0.7743167748273401,,,0.6853932584269663,0.6576576576576577,0.5283575514995362,647,verified +aviary_gpt4,68.3641975308642,12.781481481481482,1821.0640311000004,0.9205101496312952,0.9053426857899228,,,0.701123595505618,0.6486486486486487,0.5555555555555556,648,verified +alpaca_eval_gpt4_turbo_fn,68.09413580246913,5.533981481481482,864.3023563021605,0.9333333333333332,0.817290435500228,30.246913580246915,15.625,0.651685393258427,0.6036036036036037,0.5381944444444444,2592,minimal +gpt4_turbo_cot_logprob,67.86974910317902,5.397145061728395,1568.9484159171295,0.6333333333333333,0.6310442120964042,,,0.5932584269662922,0.5855855855855856,0.5285319490509259,648,verified +gpt4_turbo_cot_clf,67.59689922480621,5.3972248062015495,1528.4046718706977,0.6666666666666667,0.6326057742256878,,,0.5936794582392777,0.5855855855855856,0.5255813953488373,645,verified +claude_ranking,67.5925925925926,4.954578395061729,218.4230414438272,0.9,0.90848221004591,,,0.7303370786516854,0.6576576576576577,0.4552469135802468,648,verified +alpaca_eval_llama3_70b_fn,67.53091913784353,0.41207197526091993,208.69685160402955,0.9,0.8577236113497642,32.25308641975309,8.204334365325078,0.7910112359550562,0.6576576576576577,0.47931967529957475,2587,minimal +gpt4,66.93672839506173,12.452592592592593,1036.788589334915,0.8833333333333333,0.8668599990267735,31.481481481481488,14.621913580246911,0.647191011235955,0.6666666666666666,0.5397376543209877,2592,minimal +alpaca_farm_greedy_gpt4,66.43518518518519,15.28163425925926,877.6250469425926,0.8499999999999999,0.7481465609199582,30.246913580246915,19.290123456790123,0.597752808988764,0.6486486486486487,0.5362654320987654,2592,minimal +weighted_alpaca_eval_gpt4_turbo,65.73198824263118,4.323981481481481,227.7462866895061,0.7833333333333333,0.7688872243700914,33.89896126543981,23.652705035108028,0.6058558558558559,0.5727272727272728,0.5282783420419752,2592,minimal +humans,65.66358024691358,300.0,36800.00000000001,1.0,1.0,0.0,34.336419753086425,0.6359550561797753,0.6036036036036037,0.5177469135802468,2592,minimal +gpt4_turbo_clf,65.58641975308642,3.774166666666667,157.86959398549385,0.5666666666666667,0.6056662735192052,,,0.5123595505617977,0.5405405405405406,0.5555555555555556,648,verified +alpaca_eval_clf_gpt4_turbo,65.42635658914729,4.328077519379845,151.46231159178296,0.7166666666666667,0.7351663293324147,,,0.6049661399548533,0.5909090909090909,0.5271317829457365,645,verified +claude,65.31635802469135,3.298695848765433,172.99865933897803,0.9333333333333332,0.9028603896845376,32.407407407407405,18.47993827160494,0.6606741573033708,0.6666666666666666,0.494212962962963,2592,minimal +lmsys_gpt4,65.25848765432099,13.945289351851851,17981.91908101215,0.9833333333333332,0.9656100250020464,31.59722222222222,15.91435185185185,0.7389277389277389,0.6944444444444444,0.4635416666666667,2592,minimal +gpt4_turbo,64.14219474497682,4.165919629057188,185.73029410061824,0.5666666666666667,0.5688213739495881,,,0.5382882882882883,0.5675675675675675,0.571870170015456,647,verified +text_davinci_003,64.0817901234568,8.712680555439814,120.90134619274691,0.8499999999999999,0.8307147459007311,33.796296296296305,22.72376543209876,0.6966292134831461,0.6576576576576577,0.4733796296296295,2592,minimal +gpt4_turbo_logprob,63.51076045576003,3.774166666666667,142.6550541719136,0.6166666666666666,0.6016102512172834,35.53043431362654,17.968493710574844,0.509009009009009,0.5225225225225225,0.5600111475683258,2592,verified +guanaco_33b,62.74944567627494,,910.8929739450112,0.0,0.2495312789260463,,,0.6991150442477876,0.704225352112676,0.4257206208425721,451,verified +improved_lmsys_gpt4,62.34567901234568,13.938055555555556,5397.837981725772,0.9833333333333332,0.9273862641854697,,,0.7534883720930232,0.7117117117117117,0.4490740740740742,648,verified +longest,62.19135802469136,0.0,0.0,0.2666666666666666,0.5604276915228803,37.808641975308646,0.0,1.0,0.8828828828828829,0.4166666666666667,2592,minimal +chatgpt_fn,59.992283950617285,1.0088333333333337,529.928419875,0.75,0.8270316070156506,36.88271604938272,27.739197530864203,0.6247191011235955,0.6216216216216216,0.4911265432098766,2592,verified +alpaca_farm,57.80525502318392,11.978385883565174,1312.895122694532,0.5272012675161055,0.6048080773927609,,,0.5900900900900901,0.5636363636363636,0.5100463678516229,647,verified +chatgpt,57.28201740503198,0.8342726921591347,284.9753823429895,0.7166666666666667,0.7136212819980075,39.35185185185186,34.054591087228026,0.5910112359550562,0.5945945945945946,0.488991888760139,2589,minimal +cohere,56.60964230171073,6.485108864696734,503.1591360234836,0.2166666666666666,0.4349894801752539,,,0.6281179138321995,0.6486486486486487,0.4603421461897357,643,verified