From 7a912c70dd08bf5759d1e2d5af9aac6781fea263 Mon Sep 17 00:00:00 2001
From: Aditya Mittal <adityamittal307@gmail.com>
Date: Thu, 19 Dec 2024 13:59:21 -0500
Subject: [PATCH] Add Copilot Arena leaderboard (#3618)

Co-authored-by: Wayne Chi <waynechi@andrew.cmu.edu>
---
 fastchat/serve/monitor/copilot_arena.py | 96 +++++++++++++++++++++++++
 fastchat/serve/monitor/monitor.py       |  9 +++
 2 files changed, 105 insertions(+)
 create mode 100644 fastchat/serve/monitor/copilot_arena.py

diff --git a/fastchat/serve/monitor/copilot_arena.py b/fastchat/serve/monitor/copilot_arena.py
new file mode 100644
index 000000000..80786e341
--- /dev/null
+++ b/fastchat/serve/monitor/copilot_arena.py
@@ -0,0 +1,96 @@
+import gradio as gr
+import pandas as pd
+import requests
+import os
+
+from fastchat.serve.monitor.monitor import recompute_final_ranking
+
+copilot_arena_leaderboard_url = os.getenv("COPILOT_ARENA_LEADERBOARD_URL")
+# Importing this module fails fast when the URL is unset or empty.
+if not copilot_arena_leaderboard_url:
+    raise ValueError(
+        "COPILOT_ARENA_LEADERBOARD_URL environment variable is not set. "
+        "Please configure it to a valid URL."
+    )
+
+
+def process_copilot_arena_leaderboard(leaderboard):
+    """Keep public rows, round scores/bounds to ints, add CI text and UB rank."""
+    is_public = leaderboard["visibility"] == "public"
+    board = leaderboard.copy().loc[is_public]
+    # Bounds are stored as rating_q975 / rating_q025 -- presumably the columns that
+    # recompute_final_ranking reads (defined in monitor.py; confirm there).
+    for target, source in (
+        ("score", "score"),
+        ("rating_q975", "upper"),
+        ("rating_q025", "lower"),
+    ):
+        board[target] = board[source].round().astype(int)
+
+    board["upper_diff"] = board["rating_q975"] - board["score"]
+    board["lower_diff"] = board["score"] - board["rating_q025"]
+    # Rendered as "+X / -Y": distances from the rounded score to each bound.
+    above = board["upper_diff"].astype(str)
+    below = board["lower_diff"].astype(str)
+    board["confidence_interval"] = "+" + above + " / -" + below
+
+    # Rank becomes the first column; within a rank, higher scores come first.
+    board.insert(loc=0, column="Rank* (UB)", value=recompute_final_ranking(board))
+    ordering = {"by": ["Rank* (UB)", "score"], "ascending": [True, False]}
+    return board.sort_values(**ordering)
+
+
+def build_copilot_arena_tab():
+    """Fetch the Copilot Arena leaderboard and render it as Gradio components."""
+    if copilot_arena_leaderboard_url is None:
+        print("Copilot Arena Leaderboard URL is not set. Skipping this leaderboard.")
+        return
+    # Bound the request (seconds); without a timeout requests can wait forever.
+    response = requests.get(copilot_arena_leaderboard_url, timeout=10)
+    if response.status_code == 200:
+        leaderboard = pd.DataFrame(response.json()["elo_data"])
+        leaderboard = process_copilot_arena_leaderboard(leaderboard)
+        leaderboard = leaderboard.rename(
+            columns={
+                "name": "Model",
+                "confidence_interval": "Confidence Interval",
+                "score": "Arena Score",
+                "organization": "Organization",
+                "votes": "Votes",
+            }
+        )
+        column_order = [
+            "Rank* (UB)",
+            "Model",
+            "Arena Score",
+            "Confidence Interval",
+            "Votes",
+            "Organization",
+        ]
+        leaderboard = leaderboard[column_order]
+        num_models = len(leaderboard)
+        # NOTE(review): presumably every battle adds a vote to both models -- confirm.
+        total_battles = int(leaderboard["Votes"].sum()) // 2
+        md = f"""
+        [Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles.
+        """
+        gr.Markdown(md, elem_id="leaderboard_markdown")
+        gr.DataFrame(
+            leaderboard,
+            datatype=["str" for _ in leaderboard.columns],
+            elem_id="arena_hard_leaderboard",
+            height=600,
+            wrap=True,
+            interactive=False,
+            column_widths=[70, 130, 60, 80, 50, 80],
+        )
+        gr.Markdown(
+            """
+    ***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
+    Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval). \n
+    **Confidence Interval**: represents the range of uncertainty around the Arena Score. It's displayed as +X / -Y, where X is the difference between the upper bound and the score, and Y is the difference between the score and the lower bound.
+    """,
+            elem_id="leaderboard_markdown",
+        )
+    else:
+        gr.Markdown("Error with fetching Copilot Arena data. Check back in later.")
diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py
index c07ee4669..9f7e0dd08 100644
--- a/fastchat/serve/monitor/monitor.py
+++ b/fastchat/serve/monitor/monitor.py
@@ -1034,6 +1034,15 @@ def build_leaderboard_tab(
                 build_full_leaderboard_tab(
                     elo_results_text, model_table_df, model_to_score
                 )
+            try:
+                with gr.Tab("Copilot Arena Leaderboard", id=5):
+                    from fastchat.serve.monitor.copilot_arena import (
+                        build_copilot_arena_tab,
+                    )
+
+                    build_copilot_arena_tab()
+            except Exception as e:
+                print(f"Unable to build Copilot Arena's Leaderboard. Error: {e}")
 
         if not show_plot:
             gr.Markdown(