Skip to content

Commit

Permalink
Update monitor.py (#3627)
Browse files — browse the repository at this point in the history
  • Loading branch information
infwinston authored Nov 20, 2024
1 parent 78d7784 commit 6bfe107
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 39 deletions.
4 changes: 2 additions & 2 deletions fastchat/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

# Survey/announcement banner HTML shown at the top of the arena UI (to be removed).
# Banner accent color: #00729c.
SURVEY_LINK = """<div style='text-align: left; margin: 20px 0;'>
<div style='display: inline-block; border: 2px solid #00729c; padding: 20px; padding-bottom: 10px; padding-top: 10px; border-radius: 5px;'>
<span style='color: #00729c; font-weight: bold;'>New Launch! Copilot Arena: <a href='https://marketplace.visualstudio.com/items?itemName=copilot-arena.copilot-arena' style='color: #00729c; text-decoration: underline;'>VS Code Extension</a> to compare Top LLMs</span>
</div>
</div>"""
# Uncomment to hide the banner entirely:
# SURVEY_LINK = ""
Expand Down
46 changes: 14 additions & 32 deletions fastchat/serve/monitor/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,16 @@


def recompute_final_ranking(arena_df):
    """Compute each model's final rank from rating confidence intervals.

    A model's rank is 1 plus the number of models whose lower CI bound
    (``rating_q025``) is strictly greater than this model's upper CI bound
    (``rating_q975``) — i.e. the number of models that are statistically
    significantly better.

    Args:
        arena_df: DataFrame indexed by model name with numeric
            ``rating_q025`` and ``rating_q975`` columns.

    Returns:
        list[int]: one rank per model, in the same order as ``arena_df.index``.
    """
    q025 = arena_df["rating_q025"].values
    q975 = arena_df["rating_q975"].values

    # For each model, count how many q025 values exceed its q975 using a
    # sorted array + binary search: O(n log n) instead of the O(n^2)
    # pairwise comparison loop.
    sorted_q025 = np.sort(q025)
    insertion_indices = np.searchsorted(sorted_q025, q975, side="right")
    counts = len(sorted_q025) - insertion_indices

    rankings = 1 + counts
    ranking_series = pd.Series(rankings, index=arena_df.index)
    return ranking_series.tolist()


def arena_hard_title(date):
Expand All @@ -81,22 +78,6 @@ def arena_hard_title(date):
return arena_hard_title


def recompute_final_ranking(arena_df):
    """Compute each model's final rank from rating confidence intervals.

    A model's rank is 1 plus the number of models whose lower CI bound
    (``rating_q025``) is strictly greater than this model's upper CI bound
    (``rating_q975``).

    NOTE(review): this is a duplicate of the ``recompute_final_ranking``
    defined earlier in this module; being later, it shadows the earlier one.
    Consider deleting one of the two copies.

    Args:
        arena_df: DataFrame indexed by model name with numeric
            ``rating_q025`` and ``rating_q975`` columns.

    Returns:
        list[int]: one rank per model, in the same order as ``arena_df.index``.
    """
    lower = arena_df["rating_q025"].values
    upper = arena_df["rating_q975"].values

    # Replace the O(n^2) pairwise loop (with per-pair .loc lookups) by a
    # sort + binary search, matching the vectorized form used elsewhere
    # in this module: count, for each model, the q025 values above its q975.
    lower_sorted = np.sort(lower)
    num_not_better = np.searchsorted(lower_sorted, upper, side="right")
    num_better = len(lower_sorted) - num_not_better

    return pd.Series(1 + num_better, index=arena_df.index).tolist()


def update_elo_components(
max_num_files, elo_results_file, ban_ip_file, exclude_model_names
):
Expand Down Expand Up @@ -861,14 +842,15 @@ def build_category_leaderboard_tab(
"full_style_control",
"hard_6",
"hard_6_style_control",
"if",
"coding",
"math",
"multiturn",
"creative_writing",
"if",
"long_user",
"multiturn",
# "no_refusal",
]
selected_categories_width = [110, 110, 110, 110, 110, 80, 80, 80, 80]
selected_categories_width = [110, 110, 110, 110, 80, 80, 80, 110, 80, 80]

language_categories = [
"english",
Expand Down
13 changes: 8 additions & 5 deletions fastchat/serve/monitor/monitor_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"gemini-1.5-pro-api-0409-preview",
"bard-jan-24-gemini-pro",
"chatgpt-4o-latest-20240808",
"chatgpt-4o-latest-20240903",
]

key_to_category_name = {
Expand All @@ -18,11 +19,12 @@
"math": "Math",
"if": "Instruction Following",
"multiturn": "Multi-Turn",
"creative_writing": "Creative Writing",
"coding": "Coding",
"coding_style_control": "Coding w/ Style Control",
"hard_6": "Hard Prompts (Overall)",
"hard_6": "Hard Prompts",
"hard_english_6": "Hard Prompts (English)",
"hard_6_style_control": "Hard Prompts (Overall) w/ Style Control",
"hard_6_style_control": "Hard Prompts w/ Style Control",
"long_user": "Longer Query",
"english": "English",
"chinese": "Chinese",
Expand All @@ -47,8 +49,8 @@
"Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
"Coding": "Coding: whether conversation contains code snippets",
"Coding w/ Style Control": "Coding with Style Control",
"Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
"Hard Prompts (Overall) w/ Style Control": "Hard Prompts with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).",
"Hard Prompts": "Hard Prompts: details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
"Hard Prompts w/ Style Control": "Hard Prompts with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).",
"Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
"Longer Query": "Longer Query (>= 500 tokens)",
"English": "English Prompts",
Expand All @@ -64,6 +66,7 @@
"Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
"Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
"Creative Writing": "Creative Writing",
}
cat_name_to_baseline = {
"Hard Prompts (English)": "English",
Expand All @@ -81,7 +84,7 @@ def make_default_md_1(mirror=False):
link_color = "#1976D2" # This color should be clear in both light and dark mode
leaderboard_md = f"""
# 🏆 Chatbot Arena LLM Leaderboard: Community-driven Evaluation for Best LLM and AI chatbots
[Blog](https://blog.lmarena.ai/blog/2023/arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/6GXcFg3TH8) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
[Twitter](https://twitter.com/lmarena_ai) | [Discord](https://discord.gg/6GXcFg3TH8) | [Blog](https://blog.lmarena.ai/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Kaggle Competition](https://www.kaggle.com/competitions/wsdm-cup-multilingual-chatbot-arena)
"""

return leaderboard_md
Expand Down

0 comments on commit 6bfe107

Please sign in to comment.