diff --git a/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv b/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv index 0daaa6eb..dae10ee2 100644 --- a/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv +++ b/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv @@ -15,6 +15,7 @@ SPPO-Gemma-2-9B-It-PairRM,53.96983730150777,48.23404468746583,1803,https://huggi Llama-3-Instruct-8B-WPO-HB-v2,53.37264268894168,57.33198613024009,2472,https://huggingface.co/wzhouad/Llama3-Instruct-8B-WPO-HB-v2,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Llama-3-Instruct-8B-WPO-HB-v2/model_outputs.json,community Claude 3.5 Sonnet (06/20),52.36675427146999,40.56021409682828,1488,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/claude-3-5-sonnet-20240620/model_outputs.json,community Yi-Large Preview,51.894415134099546,57.46724251946292,2335,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/yi-large-preview/model_outputs.json,verified +GPT-4o Mini (07/18),50.727144855901976,44.65413862507926,1861,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt-4o-mini-2024-07-18/model_outputs.json,minimal Storm-7B,50.45110959343775,50.26886905528583,2045,https://huggingface.co/jieliu/Storm-7B,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Storm-7B/model_outputs.json,community GPT-4 Preview (11/06),50.0,50.0,2049,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_1106_preview/model_outputs.json,minimal Infinity-Instruct-7M-Gen-Llama3_1-70B,46.10043331712677,37.46327383827497,1654,https://huggingface.co/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-70B,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Infinity-Instruct-7M-Gen-Llama3_1-70B/model_outputs.json,community @@ -29,6 +30,7 @@ Infinity-Instruct-7M-Gen-mistral-7B,39.66949964831439,34.347412485016434,1742,ht Llama 3.1 405B Instruct,39.25732749961743,39.10666895419877,1988,https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Meta-Llama-3.1-405B-Instruct-Turbo/model_outputs.json,minimal SPPO-Llama-3-Instruct-8B-PairRM,38.56280663670214,39.67286090605648,2066,https://huggingface.co/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/SPPO-Llama-3-Instruct-8B-PairRM/model_outputs.json,community GPT-4,38.12808974440021,23.576789314782605,1365,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4/model_outputs.json,verified +Qwen2 72B Instruct,38.07461345451606,29.8527557752399,1626,https://huggingface.co/Qwen/Qwen2-72B-Instruct,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Qwen2-72B-Instruct/model_outputs.json,verified Llama 3.1 70B Instruct,38.05512453607286,39.12691443804968,2044,https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Meta-Llama-3.1-70B-Instruct-Turbo/model_outputs.json,minimal Infinity-Instruct-3M-0625-Llama3-70B,37.97881098506053,24.277231851026183,1294,https://huggingface.co/BAAI/Infinity-Instruct-3M-0625-Llama3-70B,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Infinity-Instruct-3M-0625-Llama3-70B/model_outputs.json,community Aligner 2B+Qwen1.5 72B Chat,36.725868878524274,31.773037737123104,1812,https://github.com/AlignInc/aligner-replication,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/aligner-2b_qwen1.5-72b-chat/model_outputs.json,community @@ -82,6 +84,7 @@ Infinity-Instruct-3M-0625-Qwen2-7B,21.87399673499932,15.322182555525842,1315,htt PairRM 0.4B+Tulu 2+DPO 70B (best-of-16),21.428403975507223,18.638962967441,1607,https://huggingface.co/llm-blender/PairRM,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/pairrm-tulu-2-70b/model_outputs.json,community Tulu 2+DPO 70B,21.238610038371124,15.982854374136648,1418,https://huggingface.co/allenai/tulu-2-dpo-70b,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/tulu-2-dpo-70b/model_outputs.json,verified Llama 3.1 8B Instruct,20.85398744758185,21.841523410839937,2181,https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Meta-Llama-3.1-8B-Instruct-Turbo/model_outputs.json,minimal +Mistral 7B v0.3,20.61004837179779,16.693179605176876,1581,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Mistral-7B-Instruct-v0.3/model_outputs.json,verified Mistral-7B-ReMax-v0.1,20.55136770233589,15.999331369031056,1478,https://huggingface.co/ziniuli/Mistral-7B-ReMax-v0.1,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Mistral-7B-ReMax-v0.1/model_outputs.json,community Infinity-Instruct-3M-0625-Yi-1.5-9B,20.538372631222003,16.203844277153284,1449,https://huggingface.co/BAAI/Infinity-Instruct-3M-0625-Yi-1.5-9B,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Infinity-Instruct-3M-0625-Yi-1.5-9B/model_outputs.json,community ExPO + Starling LM 7B alpha,19.4741654606294,18.17975592036216,1821,https://huggingface.co/chujiezheng/Starling-LM-7B-alpha-ExPO,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Starling-LM-7B-alpha-ExPO/model_outputs.json,community