From 7a912c70dd08bf5759d1e2d5af9aac6781fea263 Mon Sep 17 00:00:00 2001
From: Aditya Mittal
Date: Thu, 19 Dec 2024 13:59:21 -0500
Subject: [PATCH] Add Copilot Arena leaderboard (#3618)

Co-authored-by: Wayne Chi
---
NOTE(review): this patch was rebuilt from a copy whose line breaks and
indentation had been collapsed. The indentation of the monitor.py hunk is
reconstructed, not observed -- verify it against the target tree (or apply
with `git apply --ignore-whitespace`). The blob id on the copilot_arena.py
index line predates the edits made to that file below.

 fastchat/serve/monitor/copilot_arena.py | 140 ++++++++++++++++++++++++
 fastchat/serve/monitor/monitor.py       |   9 ++
 2 files changed, 149 insertions(+)
 create mode 100644 fastchat/serve/monitor/copilot_arena.py

diff --git a/fastchat/serve/monitor/copilot_arena.py b/fastchat/serve/monitor/copilot_arena.py
new file mode 100644
index 000000000..80786e341
--- /dev/null
+++ b/fastchat/serve/monitor/copilot_arena.py
@@ -0,0 +1,140 @@
+"""Copilot Arena leaderboard tab for the FastChat monitor UI."""
+
+import gradio as gr
+import pandas as pd
+import requests
+import os
+
+from fastchat.serve.monitor.monitor import recompute_final_ranking
+
+copilot_arena_leaderboard_url = os.getenv("COPILOT_ARENA_LEADERBOARD_URL")
+
+# Seconds to wait for the leaderboard endpoint. Without an explicit timeout
+# requests may block indefinitely, which would stall the UI build.
+REQUEST_TIMEOUT_SECONDS = 10
+
+# Fail at import time; the caller in monitor.py wraps the import in try/except.
+if not copilot_arena_leaderboard_url:
+    raise ValueError(
+        "COPILOT_ARENA_LEADERBOARD_URL environment variable is not set. "
+        "Please configure it to a valid URL."
+    )
+
+
+def process_copilot_arena_leaderboard(leaderboard):
+    """Prepare raw Copilot Arena leaderboard rows for display.
+
+    Keeps only rows whose ``visibility`` is ``"public"``, rounds ``score``,
+    ``upper`` and ``lower`` to integers, builds a ``confidence_interval``
+    string of the form ``+X / -Y``, inserts the ``Rank* (UB)`` column returned
+    by ``recompute_final_ranking`` and sorts by rank, then by descending score.
+
+    Args:
+        leaderboard: DataFrame with ``visibility``, ``score``, ``upper`` and
+            ``lower`` columns. The caller's frame is left untouched.
+
+    Returns:
+        A new DataFrame with the derived columns, sorted for display.
+    """
+    # Filter before copying so that only the rows that are kept get copied.
+    leaderboard = leaderboard.loc[leaderboard["visibility"] == "public"].copy()
+    leaderboard["score"] = leaderboard["score"].round().astype(int)
+    leaderboard["rating_q975"] = leaderboard["upper"].round().astype(int)
+    leaderboard["rating_q025"] = leaderboard["lower"].round().astype(int)
+
+    leaderboard["upper_diff"] = leaderboard["rating_q975"] - leaderboard["score"]
+    leaderboard["lower_diff"] = leaderboard["score"] - leaderboard["rating_q025"]
+
+    leaderboard["confidence_interval"] = (
+        "+"
+        + leaderboard["upper_diff"].astype(str)
+        + " / -"
+        + leaderboard["lower_diff"].astype(str)
+    )
+
+    rankings_ub = recompute_final_ranking(leaderboard)
+    leaderboard.insert(loc=0, column="Rank* (UB)", value=rankings_ub)
+
+    leaderboard = leaderboard.sort_values(
+        by=["Rank* (UB)", "score"], ascending=[True, False]
+    )
+
+    return leaderboard
+
+
+def build_copilot_arena_tab():
+    """Render the Copilot Arena leaderboard into the active Gradio context.
+
+    Fetches the leaderboard JSON from ``copilot_arena_leaderboard_url``,
+    processes its ``elo_data`` entries and renders an intro paragraph, the
+    leaderboard table and a legend. When the request fails or the response
+    status is not 200, a short error notice is rendered instead.
+    """
+    # Defensive: the import-time check above normally guarantees a value.
+    if not copilot_arena_leaderboard_url:
+        print("Copilot Arena Leaderboard URL is not set. Skipping this leaderboard.")
+        return
+
+    try:
+        response = requests.get(
+            copilot_arena_leaderboard_url, timeout=REQUEST_TIMEOUT_SECONDS
+        )
+    except requests.RequestException as e:
+        # Connection errors and timeouts get the same notice as a bad status
+        # code; the cause is printed so it is not lost.
+        print(f"Unable to fetch Copilot Arena data. Error: {e}")
+        gr.Markdown("Error with fetching Copilot Arena data. Check back in later.")
+        return
+
+    if response.status_code != 200:
+        gr.Markdown("Error with fetching Copilot Arena data. Check back in later.")
+        return
+
+    leaderboard = pd.DataFrame(response.json()["elo_data"])
+    leaderboard = process_copilot_arena_leaderboard(leaderboard)
+    leaderboard = leaderboard.rename(
+        columns={
+            "name": "Model",
+            "confidence_interval": "Confidence Interval",
+            "score": "Arena Score",
+            "organization": "Organization",
+            "votes": "Votes",
+        }
+    )
+
+    column_order = [
+        "Rank* (UB)",
+        "Model",
+        "Arena Score",
+        "Confidence Interval",
+        "Votes",
+        "Organization",
+    ]
+    leaderboard = leaderboard[column_order]
+    num_models = len(leaderboard)
+    # NOTE(review): halving presumably reflects each battle adding a vote to
+    # both participating models -- confirm against the data source.
+    total_battles = int(leaderboard["Votes"].sum()) // 2
+    md = f"""
+        [Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles.
+    """
+
+    gr.Markdown(md, elem_id="leaderboard_markdown")
+    gr.DataFrame(
+        leaderboard,
+        datatype=["str"] * len(leaderboard.columns),
+        elem_id="arena_hard_leaderboard",
+        height=600,
+        wrap=True,
+        interactive=False,
+        column_widths=[70, 130, 60, 80, 50, 80],
+    )
+
+    gr.Markdown(
+        """
+        ***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
+        Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval). \n
+        **Confidence Interval**: represents the range of uncertainty around the Arena Score. It's displayed as +X / -Y, where X is the difference between the upper bound and the score, and Y is the difference between the score and the lower bound.
+        """,
+        elem_id="leaderboard_markdown",
+    )
diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py
index c07ee4669..9f7e0dd08 100644
--- a/fastchat/serve/monitor/monitor.py
+++ b/fastchat/serve/monitor/monitor.py
@@ -1034,6 +1034,15 @@ def build_leaderboard_tab(
                 build_full_leaderboard_tab(
                     elo_results_text, model_table_df, model_to_score
                 )
+            try:
+                with gr.Tab("Copilot Arena Leaderboard", id=5):
+                    from fastchat.serve.monitor.copilot_arena import (
+                        build_copilot_arena_tab,
+                    )
+
+                    build_copilot_arena_tab()
+            except Exception as e:
+                print(f"Unable to build Copilot Arena's Leaderboard. Error: {e}")
 
         if not show_plot:
             gr.Markdown(