Skip to content

Commit

Permalink
Update monitor.py (#3627)
Browse files — browse the repository at this point in the history
  • Loading branch information
infwinston authored Nov 20, 2024
1 parent 78d7784 commit 6bfe107
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 39 deletions.
4 changes: 2 additions & 2 deletions fastchat/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

# Survey/announcement banner HTML shown at the top of the arena UI (to be removed).
# Banner accent color: #00729c.
SURVEY_LINK = """<div style='text-align: left; margin: 20px 0;'>
<div style='display: inline-block; border: 2px solid #00729c; padding: 20px; padding-bottom: 10px; padding-top: 10px; border-radius: 5px;'>
<span style='color: #00729c; font-weight: bold;'>New Launch! Copilot Arena: <a href='https://marketplace.visualstudio.com/items?itemName=copilot-arena.copilot-arena' style='color: #00729c; text-decoration: underline;'>VS Code Extension</a> to compare Top LLMs</span>
</div>
</div>"""
# Uncomment to hide the banner entirely:
# SURVEY_LINK = ""
Expand Down
46 changes: 14 additions & 32 deletions fastchat/serve/monitor/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,16 @@


def recompute_final_ranking(arena_df):
    """Compute each model's final rank from rating confidence intervals.

    A model's rank is 1 plus the number of models whose lower CI bound
    (``rating_q025``) is strictly greater than this model's upper CI bound
    (``rating_q975``) — i.e. the number of models that are statistically
    significantly better.

    Args:
        arena_df: DataFrame indexed by model name with numeric
            ``rating_q025`` and ``rating_q975`` columns.

    Returns:
        list[int]: one rank per model, in the same order as ``arena_df.index``.
    """
    q025 = arena_df["rating_q025"].values
    q975 = arena_df["rating_q975"].values

    # For each model, count how many q025 values exceed its q975 using a
    # sorted array + binary search: O(n log n) instead of the O(n^2)
    # pairwise comparison loop.
    sorted_q025 = np.sort(q025)
    insertion_indices = np.searchsorted(sorted_q025, q975, side="right")
    counts = len(sorted_q025) - insertion_indices

    rankings = 1 + counts
    ranking_series = pd.Series(rankings, index=arena_df.index)
    return ranking_series.tolist()


def arena_hard_title(date):
Expand All @@ -81,22 +78,6 @@ def arena_hard_title(date):
return arena_hard_title


def recompute_final_ranking(arena_df):
    """Compute each model's final rank from rating confidence intervals.

    A model's rank is 1 plus the number of models whose lower CI bound
    (``rating_q025``) is strictly greater than this model's upper CI bound
    (``rating_q975``).

    NOTE(review): this is a duplicate of the ``recompute_final_ranking``
    defined earlier in this module; being later, it shadows the earlier one.
    Consider deleting one of the two copies.

    Args:
        arena_df: DataFrame indexed by model name with numeric
            ``rating_q025`` and ``rating_q975`` columns.

    Returns:
        list[int]: one rank per model, in the same order as ``arena_df.index``.
    """
    lower = arena_df["rating_q025"].values
    upper = arena_df["rating_q975"].values

    # Replace the O(n^2) pairwise loop (with per-pair .loc lookups) by a
    # sort + binary search, matching the vectorized form used elsewhere
    # in this module: count, for each model, the q025 values above its q975.
    lower_sorted = np.sort(lower)
    num_not_better = np.searchsorted(lower_sorted, upper, side="right")
    num_better = len(lower_sorted) - num_not_better

    return pd.Series(1 + num_better, index=arena_df.index).tolist()


def update_elo_components(
max_num_files, elo_results_file, ban_ip_file, exclude_model_names
):
Expand Down Expand Up @@ -861,14 +842,15 @@ def build_category_leaderboard_tab(
"full_style_control",
"hard_6",
"hard_6_style_control",
"if",
"coding",
"math",
"multiturn",
"creative_writing",
"if",
"long_user",
"multiturn",
# "no_refusal",
]
selected_categories_width = [110, 110, 110, 110, 110, 80, 80, 80, 80]
selected_categories_width = [110, 110, 110, 110, 80, 80, 80, 110, 80, 80]

language_categories = [
"english",
Expand Down
13 changes: 8 additions & 5 deletions fastchat/serve/monitor/monitor_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"gemini-1.5-pro-api-0409-preview",
"bard-jan-24-gemini-pro",
"chatgpt-4o-latest-20240808",
"chatgpt-4o-latest-20240903",
]

key_to_category_name = {
Expand All @@ -18,11 +19,12 @@
"math": "Math",
"if": "Instruction Following",
"multiturn": "Multi-Turn",
"creative_writing": "Creative Writing",
"coding": "Coding",
"coding_style_control": "Coding w/ Style Control",
"hard_6": "Hard Prompts (Overall)",
"hard_6": "Hard Prompts",
"hard_english_6": "Hard Prompts (English)",
"hard_6_style_control": "Hard Prompts (Overall) w/ Style Control",
"hard_6_style_control": "Hard Prompts w/ Style Control",
"long_user": "Longer Query",
"english": "English",
"chinese": "Chinese",
Expand All @@ -47,8 +49,8 @@
"Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
"Coding": "Coding: whether conversation contains code snippets",
"Coding w/ Style Control": "Coding with Style Control",
"Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
"Hard Prompts (Overall) w/ Style Control": "Hard Prompts with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).",
"Hard Prompts": "Hard Prompts: details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
"Hard Prompts w/ Style Control": "Hard Prompts with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).",
"Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
"Longer Query": "Longer Query (>= 500 tokens)",
"English": "English Prompts",
Expand All @@ -64,6 +66,7 @@
"Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
"Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
"Creative Writing": "Creative Writing",
}
cat_name_to_baseline = {
"Hard Prompts (English)": "English",
Expand All @@ -81,7 +84,7 @@ def make_default_md_1(mirror=False):
link_color = "#1976D2" # This color should be clear in both light and dark mode
leaderboard_md = f"""
# 🏆 Chatbot Arena LLM Leaderboard: Community-driven Evaluation for Best LLM and AI chatbots
[Blog](https://blog.lmarena.ai/blog/2023/arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/6GXcFg3TH8) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
[Twitter](https://twitter.com/lmarena_ai) | [Discord](https://discord.gg/6GXcFg3TH8) | [Blog](https://blog.lmarena.ai/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Kaggle Competition](https://www.kaggle.com/competitions/wsdm-cup-multilingual-chatbot-arena)
"""

return leaderboard_md
Expand Down

0 comments on commit 6bfe107

Please sign in to comment.