From 86e85261b7b5ac9482ebc208f5ee055670377120 Mon Sep 17 00:00:00 2001 From: Aditya Date: Wed, 13 Nov 2024 12:08:34 -0500 Subject: [PATCH 01/10] initial commit --- fastchat/serve/monitor/monitor.py | 103 +++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 1 deletion(-) diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index 683a40b54..fd3816fb3 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -17,6 +17,7 @@ import pandas as pd import gradio as gr import numpy as np +import requests from fastchat.constants import SURVEY_LINK from fastchat.serve.monitor.basic_stats import report_basic_stats, get_log_files @@ -855,6 +856,105 @@ def build_category_leaderboard_tab( sort_rating, inputs=[rating_button], outputs=[overall_ranking_leaderboard] ) +def compute_ub_ranking(arena_df): + # Sort models based on their scores + sorted_models = arena_df.sort_values('score', ascending=False).index.tolist() + + ub_ranking = {} + current_rank = 1 + i = 0 + + while i < len(sorted_models): + current_model = sorted_models[i] + current_lower = arena_df.loc[current_model]['lower'] + tied_models = [current_model] + + # Find ties + j = i + 1 + while j < len(sorted_models): + next_model = sorted_models[j] + if arena_df.loc[next_model]['upper'] >= current_lower: + tied_models.append(next_model) + j += 1 + else: + break + + # Assign ranks to tied models + for model in tied_models: + ub_ranking[model] = current_rank + + # Move to the next unprocessed model + i = j + # Next rank is at least the position in the sorted list + current_rank = max(current_rank + 1, i + 1) + + return ub_ranking + +def process_copilot_arena_leaderboard(leaderboard): + leaderboard['score'] = leaderboard['score'].round().astype(int) + leaderboard['upper'] = leaderboard['upper'].round().astype(int) + leaderboard['lower'] = leaderboard['lower'].round().astype(int) + + leaderboard['upper_diff'] = leaderboard['upper'] - leaderboard['score'] + leaderboard['lower_diff'] = leaderboard['score'] - leaderboard['lower'] + + leaderboard['confidence_interval'] = '+' + leaderboard['upper_diff'].astype(str) + ' / -' + leaderboard['lower_diff'].astype(str) + + rankings_ub = compute_ub_ranking(leaderboard) + leaderboard.insert(loc=0, column="Rank* (UB)", value=rankings_ub) + leaderboard['Rank'] = leaderboard['score'].rank(ascending=False).astype(int) + + leaderboard = leaderboard.sort_values(by=['Rank'], ascending=[True]) + + return leaderboard + +def build_copilot_arena_tab(): + copilot_arena_leaderboard_url = "https://leaderboard-server.fly.dev/elo" + response = requests.get(copilot_arena_leaderboard_url) + if response.status_code == 200: + leaderboard = pd.DataFrame(response.json()["elo_data"]) + leaderboard = process_copilot_arena_leaderboard(leaderboard) + leaderboard = leaderboard.rename( + columns= { + "name": "Model", + "confidence_interval": "Confidence Interval", + "score": "Arena Score", + "organization": "Organization", + "votes": "Votes", + } + ) + + column_order = ["Rank* (UB)", "Model", "Arena Score", "Confidence Interval", "Votes", "Organization"] + leaderboard = leaderboard[column_order] + num_models = len(leaderboard) + total_battles = int(leaderboard['Votes'].sum())//2 + md = f"This is the leaderboard of all {num_models} models, and their relative performance in Copilot Arena. There are currently a total of {total_battles} battles." 
+ + gr.Markdown(md, elem_id="leaderboard_markdown") + gr.DataFrame( + leaderboard, + datatype=[ + "str" + for _ in leaderboard.columns + ], + elem_id="arena_hard_leaderboard", + height=600, + wrap=True, + interactive=False, + column_widths=[70, 130, 60, 80, 50, 80], + ) + + gr.Markdown( + """ + ***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model. + Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval). \n + **Confidence Interval**: represents the range of uncertainty around the Arena Score. It's displayed as +X / -Y, where X is the difference between the upper bound and the score, and Y is the difference between the score and the lower bound. + """, + elem_id="leaderboard_markdown", + ) + else: + gr.Markdown("Error with fetching Copilot Arena data. Check back in later.") + selected_categories = [ "full", @@ -917,7 +1017,6 @@ def get_model_name(model_key): combined_table = combined_table.dropna() return combined_table - def build_leaderboard_tab( elo_results_file, leaderboard_table_file, @@ -1052,6 +1151,8 @@ def build_leaderboard_tab( build_full_leaderboard_tab( elo_results_text, model_table_df, model_to_score ) + with gr.Tab("Copilot Arena Leaderboard", id=5): + build_copilot_arena_tab() if not show_plot: gr.Markdown( From da77661eef21c5c870e2f81969f370215a704470 Mon Sep 17 00:00:00 2001 From: Aditya Date: Wed, 13 Nov 2024 19:05:46 -0500 Subject: [PATCH 02/10] update text --- fastchat/serve/monitor/monitor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index fd3816fb3..f886978a0 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -928,7 +928,9 @@ def build_copilot_arena_tab(): leaderboard = leaderboard[column_order] num_models = len(leaderboard) total_battles = int(leaderboard['Votes'].sum())//2 - md = f"This is the leaderboard of all {num_models} models, and their relative performance in Copilot Arena. There are currently a total of {total_battles} battles." + md = f""" + Copilot Arena is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles. + """ gr.Markdown(md, elem_id="leaderboard_markdown") gr.DataFrame( From 518395b1346e5c8dc0967e9abd7b653b08710eb2 Mon Sep 17 00:00:00 2001 From: Aditya Date: Wed, 13 Nov 2024 19:17:26 -0500 Subject: [PATCH 03/10] add hyperlink --- fastchat/serve/monitor/monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index f886978a0..1aebca07e 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -929,7 +929,7 @@ def build_copilot_arena_tab(): num_models = len(leaderboard) total_battles = int(leaderboard['Votes'].sum())//2 md = f""" - Copilot Arena is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles. + [Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. 
This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles. """ gr.Markdown(md, elem_id="leaderboard_markdown") From 72737838f76a1188a00c82b1387c6a160e1f7ec7 Mon Sep 17 00:00:00 2001 From: Aditya Date: Thu, 14 Nov 2024 16:03:15 -0500 Subject: [PATCH 04/10] run formatter --- fastchat/serve/monitor/monitor.py | 71 ++++++++++++++++++------------- 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index 1aebca07e..7209c1391 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -856,57 +856,65 @@ def build_category_leaderboard_tab( sort_rating, inputs=[rating_button], outputs=[overall_ranking_leaderboard] ) + def compute_ub_ranking(arena_df): # Sort models based on their scores - sorted_models = arena_df.sort_values('score', ascending=False).index.tolist() - + sorted_models = arena_df.sort_values("score", ascending=False).index.tolist() + ub_ranking = {} current_rank = 1 i = 0 - + while i < len(sorted_models): current_model = sorted_models[i] - current_lower = arena_df.loc[current_model]['lower'] + current_lower = arena_df.loc[current_model]["lower"] tied_models = [current_model] - + # Find ties j = i + 1 while j < len(sorted_models): next_model = sorted_models[j] - if arena_df.loc[next_model]['upper'] >= current_lower: + if arena_df.loc[next_model]["upper"] >= current_lower: tied_models.append(next_model) j += 1 else: break - + # Assign ranks to tied models for model in tied_models: ub_ranking[model] = current_rank - + # Move to the next unprocessed model i = j # Next rank is at least the position in the sorted list current_rank = max(current_rank + 1, i + 1) - - return ub_ranking -def process_copilot_arena_leaderboard(leaderboard): - leaderboard['score'] = leaderboard['score'].round().astype(int) - leaderboard['upper'] = leaderboard['upper'].round().astype(int) - leaderboard['lower'] = leaderboard['lower'].round().astype(int) + return ub_ranking - leaderboard['upper_diff'] = leaderboard['upper'] - leaderboard['score'] - leaderboard['lower_diff'] = leaderboard['score'] - leaderboard['lower'] - leaderboard['confidence_interval'] = '+' + leaderboard['upper_diff'].astype(str) + ' / -' + leaderboard['lower_diff'].astype(str) +def process_copilot_arena_leaderboard(leaderboard): + leaderboard["score"] = leaderboard["score"].round().astype(int) + leaderboard["upper"] = leaderboard["upper"].round().astype(int) + leaderboard["lower"] = leaderboard["lower"].round().astype(int) + + leaderboard["upper_diff"] = leaderboard["upper"] - leaderboard["score"] + leaderboard["lower_diff"] = leaderboard["score"] - leaderboard["lower"] + + leaderboard["confidence_interval"] = ( + "+" + + leaderboard["upper_diff"].astype(str) + + " / -" + + leaderboard["lower_diff"].astype(str) + ) rankings_ub = compute_ub_ranking(leaderboard) leaderboard.insert(loc=0, column="Rank* (UB)", value=rankings_ub) - leaderboard['Rank'] = leaderboard['score'].rank(ascending=False).astype(int) + leaderboard["Rank"] = leaderboard["score"].rank(ascending=False).astype(int) + + leaderboard = leaderboard.sort_values(by=["Rank"], ascending=[True]) + + return leaderboard - leaderboard = leaderboard.sort_values(by=['Rank'], ascending=[True]) - - return leaderboard def build_copilot_arena_tab(): copilot_arena_leaderboard_url = "https://leaderboard-server.fly.dev/elo" @@ -915,7 +923,7 @@ def build_copilot_arena_tab(): leaderboard = pd.DataFrame(response.json()["elo_data"]) 
leaderboard = process_copilot_arena_leaderboard(leaderboard) leaderboard = leaderboard.rename( - columns= { + columns={ "name": "Model", "confidence_interval": "Confidence Interval", "score": "Arena Score", @@ -924,10 +932,17 @@ def build_copilot_arena_tab(): } ) - column_order = ["Rank* (UB)", "Model", "Arena Score", "Confidence Interval", "Votes", "Organization"] + column_order = [ + "Rank* (UB)", + "Model", + "Arena Score", + "Confidence Interval", + "Votes", + "Organization", + ] leaderboard = leaderboard[column_order] - num_models = len(leaderboard) - total_battles = int(leaderboard['Votes'].sum())//2 + num_models = len(leaderboard) + total_battles = int(leaderboard["Votes"].sum()) // 2 md = f""" [Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles. """ @@ -935,10 +950,7 @@ def build_copilot_arena_tab(): gr.Markdown(md, elem_id="leaderboard_markdown") gr.DataFrame( leaderboard, - datatype=[ - "str" - for _ in leaderboard.columns - ], + datatype=["str" for _ in leaderboard.columns], elem_id="arena_hard_leaderboard", height=600, wrap=True, @@ -1019,6 +1031,7 @@ def get_model_name(model_key): combined_table = combined_table.dropna() return combined_table + def build_leaderboard_tab( elo_results_file, leaderboard_table_file, From ca73dbe80d8e85df65a9c6849daa120354fad7ec Mon Sep 17 00:00:00 2001 From: Aditya Date: Fri, 15 Nov 2024 17:12:19 -0500 Subject: [PATCH 05/10] fix pr comments --- fastchat/serve/monitor/copilot_arena.py | 85 +++++++++++++++++ fastchat/serve/monitor/monitor.py | 116 +----------------------- 2 files changed, 87 insertions(+), 114 deletions(-) create mode 100644 fastchat/serve/monitor/copilot_arena.py diff --git a/fastchat/serve/monitor/copilot_arena.py b/fastchat/serve/monitor/copilot_arena.py new file mode 100644 index 000000000..605090be6 --- /dev/null +++ b/fastchat/serve/monitor/copilot_arena.py @@ -0,0 +1,85 @@ +import gradio as gr +import pandas as pd +import requests + +from fastchat.serve.monitor.monitor import recompute_final_ranking + +copilot_arena_leaderboard_url = ( + "https://leaderboard-server.fly.dev/elo" +) + +def process_copilot_arena_leaderboard(leaderboard): + leaderboard["score"] = leaderboard["score"].round().astype(int) + leaderboard["rating_q975"] = leaderboard["upper"].round().astype(int) + leaderboard["rating_q025"] = leaderboard["lower"].round().astype(int) + + leaderboard["upper_diff"] = leaderboard["rating_q975"] - leaderboard["score"] + leaderboard["lower_diff"] = leaderboard["score"] - leaderboard["rating_q025"] + + leaderboard["confidence_interval"] = ( + "+" + + leaderboard["upper_diff"].astype(str) + + " / -" + + leaderboard["lower_diff"].astype(str) + ) + + rankings_ub = recompute_final_ranking(leaderboard) + leaderboard.insert(loc=0, column="Rank* (UB)", value=rankings_ub) + leaderboard["Rank"] = leaderboard["score"].rank(ascending=False).astype(int) + + leaderboard = leaderboard.sort_values(by=["Rank"], ascending=[True]) + + return leaderboard + + +def build_copilot_arena_tab(): + response = requests.get(copilot_arena_leaderboard_url) + if response.status_code == 200: + leaderboard = pd.DataFrame(response.json()["elo_data"]) + leaderboard = process_copilot_arena_leaderboard(leaderboard) + leaderboard = leaderboard.rename( + columns={ + "name": "Model", + "confidence_interval": "Confidence Interval", + 
"score": "Arena Score", + "organization": "Organization", + "votes": "Votes", + } + ) + + column_order = [ + "Rank* (UB)", + "Model", + "Arena Score", + "Confidence Interval", + "Votes", + "Organization", + ] + leaderboard = leaderboard[column_order] + num_models = len(leaderboard) + total_battles = int(leaderboard["Votes"].sum()) // 2 + md = f""" + [Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles. + """ + + gr.Markdown(md, elem_id="leaderboard_markdown") + gr.DataFrame( + leaderboard, + datatype=["str" for _ in leaderboard.columns], + elem_id="arena_hard_leaderboard", + height=600, + wrap=True, + interactive=False, + column_widths=[70, 130, 60, 80, 50, 80], + ) + + gr.Markdown( + """ + ***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model. + Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval). \n + **Confidence Interval**: represents the range of uncertainty around the Arena Score. It's displayed as +X / -Y, where X is the difference between the upper bound and the score, and Y is the difference between the score and the lower bound. + """, + elem_id="leaderboard_markdown", + ) + else: + gr.Markdown("Error with fetching Copilot Arena data. Check back in later.") diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index 7209c1391..45501678c 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -17,7 +17,6 @@ import pandas as pd import gradio as gr import numpy as np -import requests from fastchat.constants import SURVEY_LINK from fastchat.serve.monitor.basic_stats import report_basic_stats, get_log_files @@ -857,119 +856,6 @@ def build_category_leaderboard_tab( ) -def compute_ub_ranking(arena_df): - # Sort models based on their scores - sorted_models = arena_df.sort_values("score", ascending=False).index.tolist() - - ub_ranking = {} - current_rank = 1 - i = 0 - - while i < len(sorted_models): - current_model = sorted_models[i] - current_lower = arena_df.loc[current_model]["lower"] - tied_models = [current_model] - - # Find ties - j = i + 1 - while j < len(sorted_models): - next_model = sorted_models[j] - if arena_df.loc[next_model]["upper"] >= current_lower: - tied_models.append(next_model) - j += 1 - else: - break - - # Assign ranks to tied models - for model in tied_models: - ub_ranking[model] = current_rank - - # Move to the next unprocessed model - i = j - # Next rank is at least the position in the sorted list - current_rank = max(current_rank + 1, i + 1) - - return ub_ranking - - -def process_copilot_arena_leaderboard(leaderboard): - leaderboard["score"] = leaderboard["score"].round().astype(int) - leaderboard["upper"] = leaderboard["upper"].round().astype(int) - leaderboard["lower"] = leaderboard["lower"].round().astype(int) - - leaderboard["upper_diff"] = leaderboard["upper"] - leaderboard["score"] - leaderboard["lower_diff"] = leaderboard["score"] - leaderboard["lower"] - - leaderboard["confidence_interval"] = ( - "+" - + leaderboard["upper_diff"].astype(str) - + " / -" - + leaderboard["lower_diff"].astype(str) - ) - - rankings_ub = compute_ub_ranking(leaderboard) - leaderboard.insert(loc=0, column="Rank* (UB)", 
value=rankings_ub) - leaderboard["Rank"] = leaderboard["score"].rank(ascending=False).astype(int) - - leaderboard = leaderboard.sort_values(by=["Rank"], ascending=[True]) - - return leaderboard - - -def build_copilot_arena_tab(): - copilot_arena_leaderboard_url = "https://leaderboard-server.fly.dev/elo" - response = requests.get(copilot_arena_leaderboard_url) - if response.status_code == 200: - leaderboard = pd.DataFrame(response.json()["elo_data"]) - leaderboard = process_copilot_arena_leaderboard(leaderboard) - leaderboard = leaderboard.rename( - columns={ - "name": "Model", - "confidence_interval": "Confidence Interval", - "score": "Arena Score", - "organization": "Organization", - "votes": "Votes", - } - ) - - column_order = [ - "Rank* (UB)", - "Model", - "Arena Score", - "Confidence Interval", - "Votes", - "Organization", - ] - leaderboard = leaderboard[column_order] - num_models = len(leaderboard) - total_battles = int(leaderboard["Votes"].sum()) // 2 - md = f""" - [Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles. - """ - - gr.Markdown(md, elem_id="leaderboard_markdown") - gr.DataFrame( - leaderboard, - datatype=["str" for _ in leaderboard.columns], - elem_id="arena_hard_leaderboard", - height=600, - wrap=True, - interactive=False, - column_widths=[70, 130, 60, 80, 50, 80], - ) - - gr.Markdown( - """ - ***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model. - Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval). \n - **Confidence Interval**: represents the range of uncertainty around the Arena Score. It's displayed as +X / -Y, where X is the difference between the upper bound and the score, and Y is the difference between the score and the lower bound. - """, - elem_id="leaderboard_markdown", - ) - else: - gr.Markdown("Error with fetching Copilot Arena data. 
Check back in later.") - - selected_categories = [ "full", "full_style_control", @@ -1167,6 +1053,8 @@ def build_leaderboard_tab( elo_results_text, model_table_df, model_to_score ) with gr.Tab("Copilot Arena Leaderboard", id=5): + from fastchat.serve.monitor.copilot_arena import build_copilot_arena_tab + build_copilot_arena_tab() if not show_plot: From 04665fa66a6437552843ddaed686ed2c1d08c3f1 Mon Sep 17 00:00:00 2001 From: Aditya Date: Fri, 15 Nov 2024 17:16:40 -0500 Subject: [PATCH 06/10] fix lint --- fastchat/serve/monitor/copilot_arena.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fastchat/serve/monitor/copilot_arena.py b/fastchat/serve/monitor/copilot_arena.py index 605090be6..e7a860319 100644 --- a/fastchat/serve/monitor/copilot_arena.py +++ b/fastchat/serve/monitor/copilot_arena.py @@ -4,9 +4,8 @@ from fastchat.serve.monitor.monitor import recompute_final_ranking -copilot_arena_leaderboard_url = ( - "https://leaderboard-server.fly.dev/elo" -) +copilot_arena_leaderboard_url = "https://leaderboard-server.fly.dev/elo" + def process_copilot_arena_leaderboard(leaderboard): leaderboard["score"] = leaderboard["score"].round().astype(int) From 0ab2c43deab8e8b0eb91725b472c0e89f8350ab4 Mon Sep 17 00:00:00 2001 From: Aditya Date: Fri, 15 Nov 2024 17:34:50 -0500 Subject: [PATCH 07/10] sort by UB --- fastchat/serve/monitor/copilot_arena.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fastchat/serve/monitor/copilot_arena.py b/fastchat/serve/monitor/copilot_arena.py index e7a860319..cb6c2d2e5 100644 --- a/fastchat/serve/monitor/copilot_arena.py +++ b/fastchat/serve/monitor/copilot_arena.py @@ -24,9 +24,10 @@ def process_copilot_arena_leaderboard(leaderboard): rankings_ub = recompute_final_ranking(leaderboard) leaderboard.insert(loc=0, column="Rank* (UB)", value=rankings_ub) - leaderboard["Rank"] = leaderboard["score"].rank(ascending=False).astype(int) - leaderboard = leaderboard.sort_values(by=["Rank"], ascending=[True]) + leaderboard = leaderboard.sort_values( + by=["Rank* (UB)", "score"], ascending=[True, False] + ) return leaderboard From 30b7f3770ae723f7c2a3480f866475692833e4ed Mon Sep 17 00:00:00 2001 From: Aditya Date: Sat, 16 Nov 2024 12:31:37 -0500 Subject: [PATCH 08/10] add visibility check --- fastchat/serve/monitor/copilot_arena.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fastchat/serve/monitor/copilot_arena.py b/fastchat/serve/monitor/copilot_arena.py index cb6c2d2e5..1abf0bc35 100644 --- a/fastchat/serve/monitor/copilot_arena.py +++ b/fastchat/serve/monitor/copilot_arena.py @@ -8,6 +8,7 @@ def process_copilot_arena_leaderboard(leaderboard): + leaderboard = leaderboard.copy().loc[leaderboard["visibility"] == "public"] leaderboard["score"] = leaderboard["score"].round().astype(int) leaderboard["rating_q975"] = leaderboard["upper"].round().astype(int) leaderboard["rating_q025"] = leaderboard["lower"].round().astype(int) From 36f8591559ac2e387e1b25068a774fda5166b078 Mon Sep 17 00:00:00 2001 From: Aditya Date: Mon, 18 Nov 2024 12:27:26 -0500 Subject: [PATCH 09/10] change to load from env --- fastchat/serve/monitor/copilot_arena.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fastchat/serve/monitor/copilot_arena.py b/fastchat/serve/monitor/copilot_arena.py index 1abf0bc35..a5047eda1 100644 --- a/fastchat/serve/monitor/copilot_arena.py +++ b/fastchat/serve/monitor/copilot_arena.py @@ -1,10 +1,17 @@ import gradio as gr import pandas as pd import requests +import os from fastchat.serve.monitor.monitor 
import recompute_final_ranking -copilot_arena_leaderboard_url = "https://leaderboard-server.fly.dev/elo" +copilot_arena_leaderboard_url = os.getenv("COPILOT_ARENA_LEADERBOARD_URL") + +if not copilot_arena_leaderboard_url: + raise ValueError( + "COPILOT_ARENA_LEADERBOARD_URL environment variable is not set. " + "Please configure it to a valid URL." + ) def process_copilot_arena_leaderboard(leaderboard): From 59b3058a4dffb20989ce878550c0d0f21aa39981 Mon Sep 17 00:00:00 2001 From: Wayne Chi Date: Wed, 18 Dec 2024 20:47:56 -0500 Subject: [PATCH 10/10] Handling situations where Copilot Arena's Leaderboard is not set / unable to be built --- fastchat/serve/monitor/copilot_arena.py | 3 +++ fastchat/serve/monitor/monitor.py | 11 ++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/fastchat/serve/monitor/copilot_arena.py b/fastchat/serve/monitor/copilot_arena.py index a5047eda1..80786e341 100644 --- a/fastchat/serve/monitor/copilot_arena.py +++ b/fastchat/serve/monitor/copilot_arena.py @@ -41,6 +41,9 @@ def process_copilot_arena_leaderboard(leaderboard): def build_copilot_arena_tab(): + if copilot_arena_leaderboard_url is None: + print("Copilot Arena Leaderboard URL is not set. Skipping this leaderboard.") + return response = requests.get(copilot_arena_leaderboard_url) if response.status_code == 200: leaderboard = pd.DataFrame(response.json()["elo_data"]) diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index 45501678c..1abe29712 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -1052,10 +1052,15 @@ def build_leaderboard_tab( build_full_leaderboard_tab( elo_results_text, model_table_df, model_to_score ) - with gr.Tab("Copilot Arena Leaderboard", id=5): - from fastchat.serve.monitor.copilot_arena import build_copilot_arena_tab + try: + with gr.Tab("Copilot Arena Leaderboard", id=5): + from fastchat.serve.monitor.copilot_arena import ( + build_copilot_arena_tab, + ) - build_copilot_arena_tab() + build_copilot_arena_tab() + except Exception as e: + print(f"Unable to build Copilot Arena's Leaderboard. Error: {e}") if not show_plot: gr.Markdown(
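
The heart of this series is the upper-bound ranking ("Rank* (UB)") introduced in patch 01 and delegated to the shared recompute_final_ranking helper in patch 05, together with the "+X / -Y" confidence-interval column. The sketch below is a minimal illustration, not code from any patch: it applies the footnote's definition (a model's rank is one plus the number of models that are statistically better, i.e. whose lower bound exceeds its upper bound) to a toy leaderboard using the score/upper/lower columns the code expects from the /elo endpoint. The sample numbers and the name ub_rank_sketch are assumptions for illustration; patch 01's compute_ub_ranking produces the ranking differently, in a single pass over the score-sorted list that groups ties against the current group leader's lower bound.

```python
# Minimal sketch of the "Rank* (UB)" definition from the tab's footnote and
# the "+X / -Y" interval formatting from process_copilot_arena_leaderboard.
# The toy data, column names (score/upper/lower), and ub_rank_sketch are
# illustrative assumptions; this is not the recompute_final_ranking helper
# that patch 05 imports from fastchat.serve.monitor.monitor.
import pandas as pd


def ub_rank_sketch(df: pd.DataFrame) -> dict:
    # A model's upper-bound rank is one plus the number of models that are
    # statistically better: those whose lower bound exceeds its upper bound.
    return {
        model: int((df["lower"] > df.loc[model, "upper"]).sum()) + 1
        for model in df.index
    }


leaderboard = pd.DataFrame(
    {
        "score": [1250.4, 1248.9, 1201.2],
        "upper": [1261.0, 1257.3, 1210.8],
        "lower": [1240.1, 1239.6, 1192.0],
    },
    index=["model-a", "model-b", "model-c"],
)

# Round first, as process_copilot_arena_leaderboard does, so the displayed
# "+X / -Y" diffs stay consistent with the displayed integer Arena Score.
rounded = leaderboard.round().astype(int)
confidence_interval = (
    "+"
    + (rounded["upper"] - rounded["score"]).astype(str)
    + " / -"
    + (rounded["score"] - rounded["lower"]).astype(str)
)

print(ub_rank_sketch(leaderboard))  # {'model-a': 1, 'model-b': 1, 'model-c': 3}
print(confidence_interval.to_dict())  # e.g. {'model-a': '+11 / -10', ...}
```

Here model-a and model-b share rank 1 because their intervals overlap, while model-c is rank 3: two models' lower bounds clear its upper bound. Rounding before computing the diffs is a deliberate design choice in patch 01; computing the diffs on the raw floats and rounding afterward could display an interval that disagrees by one point with the rounded score shown in the same row.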