From 57294e619222f432899b03298da4110dc0d4d1cd Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Mon, 2 Oct 2023 19:53:13 +0000
Subject: [PATCH 1/8] update dates

---
 fastchat/serve/monitor/clean_battle_data.py | 4 ++--
 fastchat/serve/monitor/clean_chat_data.py   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fastchat/serve/monitor/clean_battle_data.py b/fastchat/serve/monitor/clean_battle_data.py
index 4cab1af42..6b6f274dc 100644
--- a/fastchat/serve/monitor/clean_battle_data.py
+++ b/fastchat/serve/monitor/clean_battle_data.py
@@ -44,8 +44,8 @@ def get_log_files(max_num_files=None):
     dates = []
-    for month in [4, 5, 6, 7, 8, 9]:
-        for day in range(1, 32):
+    for month in range(4, 12):
+        for day in range(1, 33):
             dates.append(f"2023-{month:02d}-{day:02d}")
 
     filenames = []

diff --git a/fastchat/serve/monitor/clean_chat_data.py b/fastchat/serve/monitor/clean_chat_data.py
index 86d15bac2..7f0c9bd4f 100644
--- a/fastchat/serve/monitor/clean_chat_data.py
+++ b/fastchat/serve/monitor/clean_chat_data.py
@@ -28,8 +28,8 @@ def get_log_files(max_num_files=None):
     dates = []
-    for month in [4, 5, 6, 7, 8, 9, 10]:
-        for day in range(1, 32):
+    for month in range(4, 12):
+        for day in range(1, 33):
             dates.append(f"2023-{month:02d}-{day:02d}")
 
     filenames = []

From 17bc82bad5152ae14e66efb614f26acfc776486d Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Mon, 2 Oct 2023 20:19:55 +0000
Subject: [PATCH 2/8] update commands

---
 docs/commands/leaderboard.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/docs/commands/leaderboard.md b/docs/commands/leaderboard.md
index 0a668f649..04477a095 100644
--- a/docs/commands/leaderboard.md
+++ b/docs/commands/leaderboard.md
@@ -24,3 +24,14 @@ scp atlas:/data/lmzheng/FastChat/fastchat/serve/monitor/elo_results_20230905.pkl
 ```
 wget https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard/raw/main/leaderboard_table_20230905.csv
 ```
+
+### Update files on webserver
+```
+DATE=20231002
+
+rm -rf elo_results.pkl leaderboard_table.csv
+wget https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard/resolve/main/elo_results_$DATE.pkl
+wget https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard/resolve/main/leaderboard_table_$DATE.csv
+ln -s leaderboard_table_$DATE.csv leaderboard_table.csv
+ln -s elo_results_$DATE.pkl elo_results.pkl
+```

From d78ac6309e57e99aa7999edec3f9f0f8b4c9c1b4 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Mon, 2 Oct 2023 21:02:28 +0000
Subject: [PATCH 3/8] Update figures

---
 fastchat/serve/monitor/elo_analysis.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fastchat/serve/monitor/elo_analysis.py b/fastchat/serve/monitor/elo_analysis.py
index e96007a9d..9d11908d0 100644
--- a/fastchat/serve/monitor/elo_analysis.py
+++ b/fastchat/serve/monitor/elo_analysis.py
@@ -225,6 +225,8 @@ def report_elo_analysis_results(battles_json):
     model_order = list(elo_rating_median.keys())
     model_order.sort(key=lambda k: -elo_rating_median[k])
 
+    model_order = model_order[:25]
+
     # Plots
     leaderboard_table = visualize_leaderboard_table(elo_rating_median)
     win_fraction_heatmap = visualize_pairwise_win_fraction(battles_no_ties, model_order)

From ee2a0030851f0fa80ecae19776f26413e051a247 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Mon, 2 Oct 2023 21:25:07 +0000
Subject: [PATCH 4/8] limit show number

---
 fastchat/serve/monitor/elo_analysis.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/fastchat/serve/monitor/elo_analysis.py b/fastchat/serve/monitor/elo_analysis.py
index 9d11908d0..e95f157c8 100644
--- a/fastchat/serve/monitor/elo_analysis.py
+++ b/fastchat/serve/monitor/elo_analysis.py
@@ -58,7 +58,7 @@ def get_median_elo_from_bootstrap(bootstrap_df):
     return median
 
 
-def compute_pairwise_win_fraction(battles, model_order):
+def compute_pairwise_win_fraction(battles, model_order, limit_show_number=None):
     # Times each model wins as Model A
     a_win_ptbl = pd.pivot_table(
         battles[battles["winner"] == "model_a"],
@@ -92,6 +92,9 @@ def compute_pairwise_win_fraction(battles, model_order):
     prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
     model_order = list(prop_wins.keys())
 
+    if limit_show_number is not None:
+        model_order = model_order[:limit_show_number]
+
     # Arrange ordering according to proprition of wins
     row_beats_col = row_beats_col_freq.loc[model_order, model_order]
     return row_beats_col
@@ -166,8 +169,10 @@ def visualize_battle_count(battles, model_order):
     return fig
 
 
-def visualize_average_win_rate(battles):
-    row_beats_col_freq = compute_pairwise_win_fraction(battles, None)
+def visualize_average_win_rate(battles, limit_show_number):
+    row_beats_col_freq = compute_pairwise_win_fraction(
+        battles, None, limit_show_number=limit_show_number
+    )
     fig = px.bar(
         row_beats_col_freq.mean(axis=1).sort_values(ascending=False),
         text_auto=".2f",
@@ -180,7 +185,7 @@ def visualize_average_win_rate(battles):
     return fig
 
 
-def visualize_bootstrap_elo_rating(df):
+def visualize_bootstrap_elo_rating(df, limit_show_number):
     bars = (
         pd.DataFrame(
             dict(
@@ -192,6 +197,7 @@ def visualize_bootstrap_elo_rating(df):
         .reset_index(names="model")
         .sort_values("rating", ascending=False)
     )
+    bars = bars[:limit_show_number]
     bars["error_y"] = bars["upper"] - bars["rating"]
     bars["error_y_minus"] = bars["rating"] - bars["lower"]
     bars["rating_rounded"] = np.round(bars["rating"], 2)
@@ -225,14 +231,19 @@ def report_elo_analysis_results(battles_json):
     model_order = list(elo_rating_median.keys())
     model_order.sort(key=lambda k: -elo_rating_median[k])
 
-    model_order = model_order[:25]
+    limit_show_number = 25  # limit show number to make plots smaller
+    model_order = model_order[:limit_show_number]
 
     # Plots
     leaderboard_table = visualize_leaderboard_table(elo_rating_median)
     win_fraction_heatmap = visualize_pairwise_win_fraction(battles_no_ties, model_order)
     battle_count_heatmap = visualize_battle_count(battles_no_ties, model_order)
-    average_win_rate_bar = visualize_average_win_rate(battles_no_ties)
-    bootstrap_elo_rating = visualize_bootstrap_elo_rating(bootstrap_df)
+    average_win_rate_bar = visualize_average_win_rate(
+        battles_no_ties, limit_show_number
+    )
+    bootstrap_elo_rating = visualize_bootstrap_elo_rating(
+        bootstrap_df, limit_show_number
+    )
 
     last_updated_tstamp = battles["tstamp"].max()
     last_updated_datetime = datetime.datetime.fromtimestamp(

From 46e85003f27c1343be676d7e10c8c116870af785 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Mon, 2 Oct 2023 21:30:22 +0000
Subject: [PATCH 5/8] fix

---
 fastchat/serve/monitor/monitor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py
index 3cfa84f1f..c7dc1414d 100644
--- a/fastchat/serve/monitor/monitor.py
+++ b/fastchat/serve/monitor/monitor.py
@@ -39,7 +39,7 @@ def make_leaderboard_md(elo_results):
 - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
 - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
 
-💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: Sept, 2023.
+💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: October, 2023.
 """
     return leaderboard_md

From ed25c414d64173060f54f23fa33f97a400f38cfc Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Mon, 2 Oct 2023 21:37:59 +0000
Subject: [PATCH 6/8] fix docs

---
 fastchat/serve/huggingface_api_worker.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fastchat/serve/huggingface_api_worker.py b/fastchat/serve/huggingface_api_worker.py
index 29ddaa40c..e745333fb 100644
--- a/fastchat/serve/huggingface_api_worker.py
+++ b/fastchat/serve/huggingface_api_worker.py
@@ -1,18 +1,19 @@
 """
-A model worker to call huggingface api.
-JSON file format:
+A model worker that calls huggingface inference endpoint.
+
+Register models in a JSON file with the following format:
 {
     "falcon-180b-chat": {
         "model_path": "tiiuae/falcon-180B-chat",
         "api_base": "https://api-inference.huggingface.co/models",
         "token": "hf_xxx",
-        "context_length": 2048
+        "context_length": 2048,
         "model_names": "falcon-180b-chat",
-        "conv_template": null,
+        "conv_template": null
     }
 }
 
-Only "model_path", "api_base", and "token" are necessary, others are optional.
+"model_path", "api_base", "token", and "context_length" are necessary, while others are optional.
 """
 import argparse
 import asyncio
@@ -312,7 +313,7 @@ def create_huggingface_api_worker():
         api_base_list.append(model_info[m]["api_base"])
         token_list.append(model_info[m]["token"])
 
-        context_length = model_info[m].get("context_length", 1024)
+        context_length = model_info[m]["context_length"]
         model_names = model_info[m].get("model_names", [m.split("/")[-1]])
         if isinstance(model_names, str):
             model_names = [model_names]

From 0ca4b5f3325f93ebfc7e49b23ca239dd585acbd1 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Mon, 2 Oct 2023 22:02:10 +0000
Subject: [PATCH 7/8] update

---
 fastchat/serve/huggingface_api_worker.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fastchat/serve/huggingface_api_worker.py b/fastchat/serve/huggingface_api_worker.py
index e745333fb..a356273d9 100644
--- a/fastchat/serve/huggingface_api_worker.py
+++ b/fastchat/serve/huggingface_api_worker.py
@@ -117,6 +117,9 @@ def __init__(
             f"Connecting with huggingface api {self.model_path} as {self.model_names} on worker {worker_id} ..."
         )
 
+        if not no_register:
+            self.init_heart_beat()
+
     def count_token(self, params):
         # No tokenizer here
         ret = {

From ff77d44b995b58de61219bd320ce6153fd21d47d Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Mon, 2 Oct 2023 22:03:14 +0000
Subject: [PATCH 8/8] update

---
 fastchat/serve/monitor/monitor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py
index c7dc1414d..c465df314 100644
--- a/fastchat/serve/monitor/monitor.py
+++ b/fastchat/serve/monitor/monitor.py
@@ -35,7 +35,7 @@ def make_leaderboard_md(elo_results):
 | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
 🏆 This leaderboard is based on the following three benchmarks.
-- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 70K+ user votes to compute Elo ratings.
+- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 90K+ user votes to compute Elo ratings.
 - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
 - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
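
Note on PATCH 1/8: the new loop deliberately enumerates day numbers (up to 32) that do not exist in every month. The minimal sketch below, which is not part of the patches, shows what the loop generates; it assumes, based on the surrounding get_log_files() code that is not visible in the hunk, that candidate dates with no matching log file on disk are simply skipped, so over-generating dates is harmless.

# Sketch (assumption-laden, not part of the patches): the candidate-date
# enumeration that both cleaning scripts switch to in PATCH 1/8.
dates = []
for month in range(4, 12):  # April 2023 through November 2023
    for day in range(1, 33):  # days 1..32, deliberately over-inclusive
        dates.append(f"2023-{month:02d}-{day:02d}")

print(len(dates))  # 256 candidate dates
print(dates[0], dates[-1])  # 2023-04-01 2023-11-32 (the latter never matches a real log file)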