tatsu-lab · YannDubs · Sep 30, 2023 · Sep 25, 2023 · Sep 30, 2023 · Sep 30, 2023
diff --git a/results/ultralm-13b-best-of-16/model_outputs.json b/results/ultralm-13b-best-of-16/model_outputs.json
diff --git a/results/ultralm-13b-v2.0-best-of-16/model_outputs.json b/results/ultralm-13b-v2.0-best-of-16/model_outputs.json
diff --git a/results/ultralm-13b-v2.0/model_output.json b/results/ultralm-13b-v2.0/model_output.json
diff --git a/src/alpaca_eval/leaderboards/data_AlpacaEval/alpaca_eval_gpt4_leaderboard.csv b/src/alpaca_eval/leaderboards/data_AlpacaEval/alpaca_eval_gpt4_leaderboard.csv
@@ -2,7 +2,9 @@
 xwinlm-70b-v0.1,95.56803995006244,0.7249419256764628,765,35,1,801,community,1775.0
 gpt4,95.27950310559004,0.716281440286153,761,32,12,805,minimal,1365.0
 llama-2-70b-chat-hf,92.66169154228857,0.911762258320568,743,57,4,804,minimal,1790.0
+ultralm-13b-v2.0-best-of-16,92.29813664596274,0.9402998068253294,743,62,0,805,community,1720.0
 xwinlm-13b-v0.1,91.76029962546816,0.9681394385222166,734,65,2,801,community,1894.0
+ultralm-13b-best-of-16,91.54228855721394,0.981927769109018,736,68,0,804,community,1980.0
 claude-2,91.35572139303484,0.9897323784630048,734,69,1,804,minimal,1069.0
 openchat-v3.1-13b,89.49004975124379,1.076875474505156,718,83,3,804,community,1484.0
 chatgpt,89.36567164179104,1.0789487022114888,716,83,5,804,minimal,827.0
@@ -17,6 +19,7 @@ openbuddy-llama-65b-v8,86.53366583541147,1.2029182403474274,693,107,2,802,commun
 wizardlm-13b-v1.1,86.31840796019901,1.2063217831272972,692,108,4,804,community,1525.0
 openchat-v2-13b,84.96894409937889,1.2572979835605944,683,120,2,805,community,1564.0
 humpback-llama-65b,83.70646766169155,1.3071034735987248,672,130,2,804,community,1269.0
+ultralm-13b-v2.0,83.60248447204968,1.30578174546824,673,132,0,805,community,1399.0
 vicuna-13b-v1.3,82.11180124223603,1.348769957803504,660,143,2,805,verified,1132.0
 gpt35_turbo_instruct,81.7103620474407,1.3306133328057392,642,134,25,801,community,1018.0
 openbuddy-llama-30b-v7.1,81.54613466334165,1.370658000946423,654,148,0,802,community,968.0

diff --git a/src/alpaca_eval/models_configs/ultralm-13b-best-of-16/configs.yaml b/src/alpaca_eval/models_configs/ultralm-13b-best-of-16/configs.yaml
@@ -0,0 +1,9 @@
+ultralm-13b:
+  prompt_template: "ultralm-13b-best-of-16/prompt.txt"
+  pretty_name: "UltraLM 13B (best-of-16)"
+  link: 
+    - "https://github.com/thunlp/UltraChat"
+    - "https://github.com/thunlp/UltraFeedback"
+    - "https://huggingface.co/openbmb/UltraRM-13b"
+  # Results cannot be directly reproduced with alpaca_eval official `fn_completions` because they require best-of-n sampling.
+  # The reproduction requires generating 16 completions using vllm at inference time and then using a reward model, UltraRM, to select the one with the highest reward.
diff --git a/src/alpaca_eval/models_configs/ultralm-13b-best-of-16/prompt.txt b/src/alpaca_eval/models_configs/ultralm-13b-best-of-16/prompt.txt
@@ -0,0 +1,3 @@
+User: You are now a helpful assistant, your answer should focus the question give reasonable solutions, self-consistent, of high-quality to make users satisfied. Please try to make the answer more informative, detailed, and polite.</s>
+User: {instruction}</s>
+Assistant: 
diff --git a/src/alpaca_eval/models_configs/ultralm-13b-v2.0-best-of-16/configs.yaml b/src/alpaca_eval/models_configs/ultralm-13b-v2.0-best-of-16/configs.yaml
@@ -0,0 +1,9 @@
+ultralm-13b:
+  prompt_template: "ultralm-13b-v2.0-best-of-16/prompt.txt"
+  pretty_name: "UltraLM 13B V2.0 (best-of-16)"
+  link: 
+    - "https://github.com/thunlp/UltraChat"
+    - "https://github.com/thunlp/UltraFeedback"
+    - "https://huggingface.co/openbmb/UltraRM-13b"
+  # Results cannot be directly reproduced with alpaca_eval official `fn_completions` because they require best-of-n sampling.
+  # The reproduction requires generating 16 completions using vllm at inference time and then using a reward model, UltraRM, to select the one with the highest reward.
diff --git a/src/alpaca_eval/models_configs/ultralm-13b-v2.0-best-of-16/prompt.txt b/src/alpaca_eval/models_configs/ultralm-13b-v2.0-best-of-16/prompt.txt
@@ -0,0 +1,3 @@
+User: You are now a helpful assistant, your answer should focus the question give reasonable solutions, self-consistent, of high-quality to make users satisfied. Please try to make the answer more informative, detailed, and polite.</s>
+User: {instruction}</s>
+Assistant: 
diff --git a/src/alpaca_eval/models_configs/ultralm-13b-v2.0/configs.yaml b/src/alpaca_eval/models_configs/ultralm-13b-v2.0/configs.yaml
@@ -0,0 +1,14 @@
+ultralm-13b:
+  prompt_template: "ultralm-13b-v2.0/prompt.txt"
+  fn_completions: "huggingface_local_completions"
+  completions_kwargs:
+    model_name: "./ultralm-13b-v2.0" # local path
+    model_kwargs:
+      torch_dtype: 'float16'
+    max_new_tokens: 2048
+    temperature: 0.7
+    top_p: 1.0
+    do_sample: True
+    early_stopping: True
+  pretty_name: "UltraLM 13B V2.0"
+  link: "https://github.com/thunlp/UltraChat"
diff --git a/src/alpaca_eval/models_configs/ultralm-13b-v2.0/prompt.txt b/src/alpaca_eval/models_configs/ultralm-13b-v2.0/prompt.txt
@@ -0,0 +1,3 @@
+User: You are now a helpful assistant, your answer should focus the question give reasonable solutions, self-consistent, of high-quality to make users satisfied. Please try to make the answer more informative, detailed, and polite.</s>
+User: {instruction}</s>
+Assistant: