diff --git a/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv b/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv index 137515b5..ea4b1a50 100644 --- a/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv +++ b/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv @@ -6,12 +6,14 @@ Qwen1.5 72B Chat,36.571754111987296,26.49828339562733,1549,https://huggingface.c GPT-4 0314,35.30706121640206,22.073258928708075,1371,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_0314/model_outputs.json,verified Claude 3 Sonnet (02/29),34.87247436243302,25.556325292273296,1420,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/claude-3-sonnet-20240229/model_outputs.json,verified Mistral Large (24/02),32.65207998531868,21.43877598137888,1362,https://mistral.ai/news/la-plateforme/,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/mistral-large-2402/model_outputs.json,minimal +Samba CoE v0.2 (best-of-16),31.506544268148147,26.988254318335404,1578,https://huggingface.co/spaces/sambanovasystems/Samba-CoE-v0.1,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Samba-CoE-v0.2-best-of-16/model_outputs.json,community GPT-4 0613,30.18332231673423,15.75503808763975,1140,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_0613/model_outputs.json,verified Snorkel (Mistral-PairRM-DPO+best-of-16),29.974321613074405,34.8601328912795,2616,https://huggingface.co/snorkelai/Snorkel-Mistral-PairRM-DPO,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Snorkel-Mistral-PairRM-DPO-best-of-16/model_outputs.json,community Contextual AI (KTO-Mistral-PairRM),29.705808939683976,33.227355200024846,2521,https://huggingface.co/ContextualAI/Contextual_KTO_Mistral_PairRM,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Contextual-KTO-Mistral-PairRM/model_outputs.json,verified PairRM 0.4B+Yi-34B-Chat (best-of-16),28.81484086684313,31.24128294680746,2195,https://huggingface.co/llm-blender/PairRM,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/pairrm-Yi-34B-Chat/model_outputs.json,community Mistral Medium,28.614337401726104,21.855772543652176,1500,https://mistral.ai/news/la-plateforme/,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/mistral-medium/model_outputs.json,verified Claude 2,28.155196141629148,17.188240356708075,1069,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/claude-2/model_outputs.json,verified +Samba CoE v0.2,27.62426735006872,21.847378669267083,1469,https://huggingface.co/spaces/sambanovasystems/Samba-CoE-v0.1,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Samba-CoE-v0.2/model_outputs.json,community Claude,27.289504443727107,16.98534361236025,1082,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/claude/model_outputs.json,verified Yi 34B Chat,27.19054787762733,29.65994671879504,2123,https://huggingface.co/01-ai/Yi-34B-Chat,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Yi-34B-Chat/model_outputs.json,verified Snorkel (Mistral-PairRM-DPO),26.39144645733206,30.220052700671644,2736,https://huggingface.co/snorkelai/Snorkel-Mistral-PairRM-DPO,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Snorkel-Mistral-PairRM-DPO/model_outputs.json,community @@ -21,6 +23,7 @@ XwinLM 70b V0.1,24.649686057119272,21.812957073875776,1775,https://github.com/Xw Gemini Pro,24.38177610802152,18.177644540571432,1456,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gemini-pro/model_outputs.json,community Mixtral 8x7B v0.1,23.68848260134481,18.25531762637268,1465,https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Mixtral-8x7B-Instruct-v0.1/model_outputs.json,minimal Evo v2 7B,23.35770570204821,20.834113022583853,1754,https://evolusion.ai,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/evo-v2-7b/model_outputs.json,community +Samba CoE v0.1,22.865837334795227,16.835501870062114,1316,https://huggingface.co/spaces/sambanovasystems/Samba-CoE-v0.1,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Samba-CoE-v0.1/model_outputs.json,community GPT 3.5 Turbo 0613,22.720189163383225,14.13239070746584,1328,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt-3.5-turbo-16k-0613/model_outputs.json,verified GPT 3.5 Turbo 0613,22.35251298054288,14.09579857390062,1331,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt-3.5-turbo-0613/model_outputs.json,community PairRM 0.4B+Tulu 2+DPO 70B (best-of-16),21.428403975507223,18.638962967441,1607,https://huggingface.co/llm-blender/PairRM,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/pairrm-tulu-2-70b/model_outputs.json,community @@ -44,6 +47,7 @@ DEITA 7B v1.0,16.05901353966741,12.646639472385097,1417,https://github.com/hkust JinaChat,15.866004049505932,7.786130393366459,676,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/jina-chat/model_outputs.json,community CausalLM-14B,15.72032518895564,11.146160869950313,1391,https://huggingface.co/CausalLM/14B,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/causallm-14b/model_outputs.json,community PairRM 0.4B+Zephyr 7B Beta (best-of-16),15.529867294986612,12.84127825562733,1487,https://huggingface.co/llm-blender/PairRM,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/pairrm-zephyr-7b-beta/model_outputs.json,community +Mistral-ORPO-Beta,14.716749430705242,12.565408794559003,1636,https://huggingface.co/kaist-ai/mistral-orpo-beta,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/mistral-orpo-beta/model_outputs.json,community Starling LM 7B alpha,14.690471079424972,14.24592352162733,1895,https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Starling-LM-7B-alpha/model_outputs.json,community LLaMA2 Chat 70B,14.689648588392544,13.88825834374378,1790,https://ai.meta.com/llama/,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/llama-2-70b-chat-hf/model_outputs.json,verified OpenChat V3.1 13B,14.50338795683784,11.082230489416148,1484,https://github.com/imoneoi/openchat,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/openchat-v3.1-13b/model_outputs.json,community diff --git a/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv b/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv index 2a7523d7..5af08670 100644 --- a/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv +++ b/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv @@ -9,6 +9,7 @@ gpt4_0314,22.073258928708075,1.2466725494608204,172,627,6,805,21.73913043478261, claude-3-sonnet-20240229,25.556325292273296,1.3419811051815638,193,608,4,805,24.22360248447205,verified,1420,34.87247436243302 gpt4_0613_verbose,23.237360043453418,1.283539505582624,171,630,4,805,21.490683229813666,dev,1473,33.82126688658535 mistral-large-2402,21.43877598137888,1.2485232545097724,166,638,1,805,20.6832298136646,minimal,1362,32.65207998531868 +Samba-CoE-v0.2-best-of-16,26.988254318335404,1.3189030000371738,201,601,3,805,25.15527950310559,community,1578,31.506544268148147 claude-2.1_verbose,24.35407109006212,1.293586209982439,191,613,1,805,23.7888198757764,dev,1414,30.29117916664986 gpt4_0613,15.75503808763975,1.0754642482396215,117,684,4,805,14.782608695652174,verified,1140,30.18332231673423 Snorkel-Mistral-PairRM-DPO-best-of-16,34.8601328912795,1.3599450436840308,270,533,2,805,33.66459627329193,community,2616,29.974321613074405 @@ -16,6 +17,7 @@ Contextual-KTO-Mistral-PairRM,33.227355200024846,1.3779687477923963,260,544,1,80 pairrm-Yi-34B-Chat,31.24128294680746,1.34824373994879,239,563,3,805,29.87577639751553,community,2195,28.81484086684313 mistral-medium,21.855772543652176,1.2682402187223842,164,639,2,805,20.496894409937887,verified,1500,28.614337401726104 claude-2,17.188240356708075,1.17482825615589,131,673,1,805,16.335403726708076,verified,1069,28.155196141629148 +Samba-CoE-v0.2,21.847378669267083,1.2171089783436106,159,645,1,805,19.81366459627329,community,1469,27.62426735006872 claude,16.98534361236025,1.1687959793014906,129,676,0,805,16.024844720496894,verified,1082,27.289504443727107 Yi-34B-Chat,29.65994671879504,1.3225712597906096,219,582,4,805,27.45341614906832,verified,2123,27.19054787762733 Snorkel-Mistral-PairRM-DPO,30.220052700671644,1.3328273012530358,231,572,1,804,28.79353233830846,community,2736,26.39144645733206 @@ -27,6 +29,7 @@ Mixtral-8x7B-Instruct-v0.1,18.25531762637268,1.1885585968848205,135,668,2,805,16 evo-v2-7b,20.834113022583853,1.2159901798146158,158,644,3,805,19.81366459627329,community,1754,23.35770570204821 Mixtral-8x7B-Instruct-v0.1_verbose,24.61406305018634,1.2975757385881228,194,609,2,805,24.22360248447205,dev,2083,23.223120780856064 Mixtral-8x7B-Instruct-v0.1_concise,13.744040154795034,1.071868299237546,105,700,0,805,13.043478260869565,dev,910,22.962609472758643 +Samba-CoE-v0.1,16.835501870062114,1.1180386124646702,124,680,1,805,15.46583850931677,community,1316,22.865837334795227 gpt-3.5-turbo-16k-0613,14.13239070746584,1.027579400264853,96,704,5,805,12.236024844720497,verified,1328,22.720189163383225 gpt-3.5-turbo-0613,14.09579857390062,1.0371186215049395,99,700,6,805,12.670807453416147,community,1331,22.35251298054288 gpt-3.5-turbo-1106_verbose,12.76316981026087,1.044246819212278,94,709,2,805,11.801242236024844,dev,1058,22.00093702171442 @@ -54,6 +57,7 @@ jina-chat,7.786130393366459,0.8398450575524877,59,743,3,805,7.515527950310559,co gpt-3.5-turbo-1106_concise,7.41586497762733,0.8374438113826953,57,744,4,805,7.329192546583851,dev,431,15.769520983894386 causallm-14b,11.146160869950313,0.9544127300795228,81,720,4,805,10.31055900621118,community,1391,15.72032518895564 pairrm-zephyr-7b-beta,12.84127825562733,1.0535874941903722,98,706,1,805,12.236024844720497,community,1487,15.529867294986612 +mistral-orpo-beta,12.565408794559003,0.9929774686147969,95,707,3,805,11.987577639751551,community,1636,14.716749430705242 Starling-LM-7B-alpha,14.24592352162733,1.0685460609395083,102,702,1,805,12.732919254658384,community,1895,14.690471079424972 llama-2-70b-chat-hf,13.88825834374378,1.079984772728814,104,700,0,804,12.935323383084576,verified,1790,14.689648588392544 openchat-v3.1-13b,11.082230489416148,0.9501308701291292,80,720,5,805,10.248447204968944,community,1484,14.50338795683784 @@ -66,7 +70,7 @@ humpback-llama-65b,9.425139047801242,0.9300866722901956,70,734,1,805,8.757763975 openbuddy-llama2-70b-v10.1,8.096422096285714,0.8498371493561294,57,744,4,805,7.329192546583851,community,1077,12.572173272324846 openbuddy-llama-65b-v8,8.77065015089441,0.8871992619444647,64,738,3,805,8.136645962732919,community,1162,12.469356289070015 Qwen-14B-Chat,7.502333484720497,0.8147265702205473,57,742,6,805,7.453416149068323,community,1013,12.378741790737235 -gpt4_gamed,3.7383373713788823,0.6278799633668313,32,771,2,805,4.099378881987578,community,68,12.188764057640531 +gpt4_gamed,3.7383373713788814,0.6278799633668313,32,771,2,805,4.099378881987578,community,68,12.188764057640531 cut-13b,10.779089202496897,0.9428953578911924,83,721,1,805,10.372670807453416,community,1637,12.154781753927743 openchat-v2-w-13b,9.615344158447204,0.8908241710735803,67,736,2,805,8.4472049689441,community,1566,12.03042777097436 tulu-2-dpo-13b,10.119788388347828,0.929813366016608,75,728,2,805,9.440993788819876,community,1614,11.554479428088396