Merge branch 'main' into moderation-log

lm-sys · Nov 28, 2024 · 4c9c98f · 4c9c98f
2 parents 87b6390 + 1cd4b74
commit 4c9c98f
Show file tree

Hide file tree

Showing 18 changed files with 579 additions and 308 deletions.
diff --git a/README.md b/README.md
@@ -237,6 +237,33 @@ This is the user interface that users will interact with.
 By following these steps, you will be able to serve your models using the web UI. You can open your browser and chat with a model now.
 If the models do not show up, try to reboot the gradio web server.
 
+## Launch Chatbot Arena (side-by-side battle UI)
+
+Currently, Chatbot Arena is powered by FastChat. Here is how you can launch an instance of Chatbot Arena locally.
+
+FastChat supports popular API-based models such as OpenAI, Anthropic, Gemini, Mistral and more. To add a custom API, please refer to the model support [doc](./docs/model_support.md). Below we take OpenAI models as an example.
+
+Create a JSON configuration file `api_endpoint.json` with the API endpoints of the models you want to serve, for example:
+```
+{
+    "gpt-4o-2024-05-13": {
+        "model_name": "gpt-4o-2024-05-13",
+        "api_base": "https://api.openai.com/v1",
+        "api_type": "openai",
+        "api_key": "[Insert API Key]",
+        "anony_only": false
+    }
+}
+```
+For Anthropic models, specify `"api_type": "anthropic_message"` with your Anthropic key. Similarly, for Gemini models, specify `"api_type": "gemini"`. More details can be found in [api_provider.py](https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/api_provider.py).
+
+To serve your own model using local GPUs, follow the instructions in [Serving with Web GUI](#serving-with-web-gui).
+
+Now you're ready to launch the server:
+```
+python3 -m fastchat.serve.gradio_web_server_multi --register-api-endpoint-file api_endpoint.json
+```
+
 #### (Optional): Advanced Features, Scalability, Third Party UI
 - You can register multiple model workers to a single controller, which can be used for serving a single model with higher throughput or serving multiple models at the same time. When doing so, please allocate different GPUs and ports for different model workers.
 ```

diff --git a/fastchat/constants.py b/fastchat/constants.py
@@ -9,8 +9,8 @@
 
 # Survey Link URL (to be removed) #00729c
 SURVEY_LINK = """<div style='text-align: left; margin: 20px 0;'>
-    <div style='display: inline-block; border: 2px solid #C41E3A; padding: 20px; padding-bottom: 10px; padding-top: 10px; border-radius: 5px;'>
-        <span style='color: #C41E3A; font-weight: bold;'>New Launch! Jailbreak models at <a href='https://redarena.ai' style='color: #C41E3A; text-decoration: underline;'>RedTeam Arena</a>. </span>
+    <div style='display: inline-block; border: 2px solid #00729c; padding: 20px; padding-bottom: 10px; padding-top: 10px; border-radius: 5px;'>
+        <span style='color: #00729c; font-weight: bold;'>New Launch! Copilot Arena: <a href='https://marketplace.visualstudio.com/items?itemName=copilot-arena.copilot-arena' style='color: #00729c; text-decoration: underline;'>VS Code Extension</a> to compare Top LLMs</span>
     </div>
 </div>"""
 # SURVEY_LINK = ""

diff --git a/fastchat/serve/api_provider.py b/fastchat/serve/api_provider.py
@@ -122,12 +122,14 @@ def get_api_provider_stream_iter(
         )
     elif model_api_dict["api_type"] == "bard":
         prompt = conv.to_openai_api_messages()
-        stream_iter = bard_api_stream_iter(
+        stream_iter = gemini_api_stream_iter(
             model_api_dict["model_name"],
             prompt,
-            temperature,
-            top_p,
-            api_key=model_api_dict["api_key"],
+            None,  # use Bard's default temperature
+            None,  # use Bard's default top_p
+            max_new_tokens,
+            api_key=(model_api_dict["api_key"] or os.environ["BARD_API_KEY"]),
+            use_stream=False,
         )
     elif model_api_dict["api_type"] == "mistral":
         if model_api_dict.get("vision-arena", False):
@@ -242,6 +244,7 @@ def get_api_provider_stream_iter(
             max_new_tokens,
             api_base=model_api_dict["api_base"],
             api_key=model_api_dict["api_key"],
+            conversation_id=state.conv_id,
         )
     else:
         raise NotImplementedError()
@@ -759,75 +762,6 @@ def gemini_api_stream_iter(
             }
 
 
-def bard_api_stream_iter(model_name, conv, temperature, top_p, api_key=None):
-    del top_p  # not supported
-    del temperature  # not supported
-
-    if api_key is None:
-        api_key = os.environ["BARD_API_KEY"]
-
-    # convert conv to conv_bard
-    conv_bard = []
-    for turn in conv:
-        if turn["role"] == "user":
-            conv_bard.append({"author": "0", "content": turn["content"]})
-        elif turn["role"] == "assistant":
-            conv_bard.append({"author": "1", "content": turn["content"]})
-        else:
-            raise ValueError(f"Unsupported role: {turn['role']}")
-
-    params = {
-        "model": model_name,
-        "prompt": conv_bard,
-    }
-    logger.info(f"==== request ====\n{params}")
-
-    try:
-        res = requests.post(
-            f"https://generativelanguage.googleapis.com/v1beta2/models/{model_name}:generateMessage?key={api_key}",
-            json={
-                "prompt": {
-                    "messages": conv_bard,
-                },
-            },
-            timeout=60,
-        )
-    except Exception as e:
-        logger.error(f"==== error ====\n{e}")
-        yield {
-            "text": f"**API REQUEST ERROR** Reason: {e}.",
-            "error_code": 1,
-        }
-
-    if res.status_code != 200:
-        logger.error(f"==== error ==== ({res.status_code}): {res.text}")
-        yield {
-            "text": f"**API REQUEST ERROR** Reason: status code {res.status_code}.",
-            "error_code": 1,
-        }
-
-    response_json = res.json()
-    if "candidates" not in response_json:
-        logger.error(f"==== error ==== response blocked: {response_json}")
-        reason = response_json["filters"][0]["reason"]
-        yield {
-            "text": f"**API REQUEST ERROR** Reason: {reason}.",
-            "error_code": 1,
-        }
-
-    response = response_json["candidates"][0]["content"]
-    pos = 0
-    while pos < len(response):
-        # simulate token streaming
-        pos += 5
-        time.sleep(0.001)
-        data = {
-            "text": response[:pos],
-            "error_code": 0,
-        }
-        yield data
-
-
 def ai2_api_stream_iter(
     model_name,
     model_id,
@@ -1262,6 +1196,7 @@ def metagen_api_stream_iter(
     max_new_tokens,
     api_key,
     api_base,
+    conversation_id,
 ):
     try:
         text_messages = []
@@ -1294,6 +1229,7 @@ def metagen_api_stream_iter(
                 "model": model_name,
                 "chunks_delimited": True,
                 "messages": messages,
+                "conversation_id": conversation_id,
                 "options": {
                     "max_tokens": max_new_tokens,
                     "generation_algorithm": "top_p",

diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py
@@ -60,11 +60,11 @@ def load_demo_side_by_side_anony(models_, url_params):
     global models
     models = models_
 
-    states = (None,) * num_sides
-    selector_updates = (
+    states = [None] * num_sides
+    selector_updates = [
         gr.Markdown(visible=True),
         gr.Markdown(visible=True),
-    )
+    ]
 
     return states + selector_updates
 
@@ -522,6 +522,12 @@ def build_side_by_side_ui_anony(models):
                         elem_id="chatbot",
                         height=650,
                         show_copy_button=True,
+                        latex_delimiters=[
+                            {"left": "$", "right": "$", "display": False},
+                            {"left": "$$", "right": "$$", "display": True},
+                            {"left": r"\(", "right": r"\)", "display": False},
+                            {"left": r"\[", "right": r"\]", "display": True},
+                        ],
                     )
 
         with gr.Row():

diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py
@@ -50,7 +50,7 @@ def set_global_vars_named(enable_moderation_, use_remote_storage_):
 
 
 def load_demo_side_by_side_named(models, url_params):
-    states = (None,) * num_sides
+    states = [None] * num_sides
 
     model_left = models[0] if len(models) > 0 else ""
     if len(models) > 1:
@@ -60,10 +60,10 @@ def load_demo_side_by_side_named(models, url_params):
     else:
         model_right = model_left
 
-    selector_updates = (
+    selector_updates = [
         gr.Dropdown(choices=models, value=model_left, visible=True),
         gr.Dropdown(choices=models, value=model_right, visible=True),
-    )
+    ]
 
     return states + selector_updates
 
@@ -409,6 +409,12 @@ def build_side_by_side_ui_named(models):
                         elem_id=f"chatbot",
                         height=650,
                         show_copy_button=True,
+                        latex_delimiters=[
+                            {"left": "$", "right": "$", "display": False},
+                            {"left": "$$", "right": "$$", "display": True},
+                            {"left": r"\(", "right": r"\)", "display": False},
+                            {"left": r"\[", "right": r"\]", "display": True},
+                        ],
                     )
 
     with gr.Row():

diff --git a/fastchat/serve/gradio_block_arena_vision.py b/fastchat/serve/gradio_block_arena_vision.py
@@ -346,6 +346,12 @@ def build_single_vision_language_model_ui(
                 label="Scroll down and start chatting",
                 height=650,
                 show_copy_button=True,
+                latex_delimiters=[
+                    {"left": "$", "right": "$", "display": False},
+                    {"left": "$$", "right": "$$", "display": True},
+                    {"left": r"\(", "right": r"\)", "display": False},
+                    {"left": r"\[", "right": r"\]", "display": True},
+                ],
             )
 
     with gr.Row():

diff --git a/fastchat/serve/gradio_block_arena_vision_anony.py b/fastchat/serve/gradio_block_arena_vision_anony.py
@@ -474,6 +474,12 @@ def build_side_by_side_vision_ui_anony(context: Context, random_questions=None):
                                 elem_id="chatbot",
                                 height=650,
                                 show_copy_button=True,
+                                latex_delimiters=[
+                                    {"left": "$", "right": "$", "display": False},
+                                    {"left": "$$", "right": "$$", "display": True},
+                                    {"left": r"\(", "right": r"\)", "display": False},
+                                    {"left": r"\[", "right": r"\]", "display": True},
+                                ],
                             )
 
                 with gr.Row():

diff --git a/fastchat/serve/gradio_block_arena_vision_named.py b/fastchat/serve/gradio_block_arena_vision_named.py
@@ -409,6 +409,12 @@ def build_side_by_side_vision_ui_named(context: Context, random_questions=None):
                                 elem_id=f"chatbot",
                                 height=650,
                                 show_copy_button=True,
+                                latex_delimiters=[
+                                    {"left": "$", "right": "$", "display": False},
+                                    {"left": "$$", "right": "$$", "display": True},
+                                    {"left": r"\(", "right": r"\)", "display": False},
+                                    {"left": r"\[", "right": r"\]", "display": True},
+                                ],
                             )
 
     with gr.Row():

diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py
@@ -942,6 +942,12 @@ def build_single_model_ui(models, add_promotion_links=False):
             label="Scroll down and start chatting",
             height=650,
             show_copy_button=True,
+            latex_delimiters=[
+                {"left": "$", "right": "$", "display": False},
+                {"left": "$$", "right": "$$", "display": True},
+                {"left": r"\(", "right": r"\)", "display": False},
+                {"left": r"\[", "right": r"\]", "display": True},
+            ],
         )
     with gr.Row():
         textbox = gr.Textbox(

diff --git a/fastchat/serve/monitor/classify/README.md b/fastchat/serve/monitor/classify/README.md
@@ -0,0 +1,60 @@
+## Download dataset
+We have pre-generated several category classifier benchmarks and ground truths. You can download them (with [`git-lfs`](https://git-lfs.com) installed) to the directory `classify/` by running
+```console
+> git clone https://huggingface.co/datasets/lmarena-ai/categories-benchmark-eval
+// cd into classify/ and then copy the label_bench directory to the current directory
+> cp -r categories-benchmark-eval/label_bench . 
+```
+Your label_bench directory should follow the structure:
+```markdown
+├── label_bench/
+│   ├── creative_writing_bench/
+│   │   ├── data/
+│   │   │    └── llama-v3p1-70b-instruct.json
+│   │   └── test.json
+│   ├── ...
+│   ├── your_bench_name/
+│   │   ├── data/
+│   │   │    ├── your_classifier_data_1.json
+│   │   │    ├── your_classifier_data_2.json
+│   │   │    └── ...
+│   │   └── test.json (your ground truth)
+└── ...
+```
+
+## How to evaluate your category classifier?
+
+To test a classifier for a new category, first make sure you have created the category's child class in `category.py`. Then, to generate classification labels, make the necessary edits in `config.yaml` and run
+```console
+python label.py --config config.yaml --testing
+```
+
+Then, add your new category bench to `tag_names` in `display_score.py`. After making sure that you also have a correctly formatted ground truth JSON file, you can report the performance of your classifier by running
+```console
+python display_score.py --bench <your_bench>
+```
+
+If you want to check out conflicts between your classifier and ground truth, use
+```console
+python display_score.py --bench <your_bench> --display-conflict
+```
+
+Example output:
+```console
+> python display_score.py --bench if_bench --display-conflict
+Model: gpt-4o-mini-2024-07-18
+Accuracy: 0.967
+Precision: 0.684
+Recall: 0.918
+
+###### CONFLICT ######
+
+Ground Truth = True; Pred = False
+####################
+...
+
+Ground Truth = False; Pred = True
+####################
+...
+```
+
diff --git a/fastchat/serve/monitor/classify/category.py b/fastchat/serve/monitor/classify/category.py
@@ -24,6 +24,8 @@ def create_category(name):
             return CategoryIF()
         elif name == "math_v0.1":
             return CategoryMath()
+        elif name == "creative_writing_v0.1":
+            return CategoryCreativeWriting()
 
         raise Exception(f"Category name is incorrect: {name}")
 
@@ -134,3 +136,41 @@ def pre_process(self, prompt):
     def post_process(self, judgment):
         score = self.get_score(judgment=judgment)
         return {"math": bool(score == "yes") if score else False}
+
+
+class CategoryCreativeWriting(Category):
+    def __init__(self):
+        super().__init__()
+        self.name_tag = "creative_writing_v0.1"
+        self.pattern = re.compile(r"<decision>(\w+)<\/decision>")
+        self.system_prompt = 'You are tasked with determining whether a given user prompt is asking for creative writing. Creative writing is defined as any form of writing that goes beyond standard professional, journalistic, academic, or technical literature. It typically involves imagination, originality, and expression of thoughts and emotions. Creative writing can include, but is not limited to, the following formats:\n- Fiction (e.g., short stories, novels)\n- Poetry (e.g., sonnets, free verse)\n- Dramatic writing (e.g., screenplays, monologues, scripts)\n- Personal essays (focusing on subjective experiences or narrative storytelling)\n- Songs and lyrics\n\nCarefully analyze the user prompt and consider whether it primarily requires creative writing. Think about the following aspects:\n1. Does the prompt ask for fictional content, speculative scenarios, or the use of imagination to construct narratives?\n2. Does it encourage the expression of thoughts, emotions, or personal experiences beyond mere factual reporting or analysis?\n3. Is it asking for writing in a specific creative format (e.g., story, poem, script, etc)?\n4. Is the primary purpose of the prompt to foster creative expression or originality rather than information delivery, technical documentation, or analytical reasoning?\n5. Does the prompt request stylistic or rhetorical elements often associated with creative writing, such as metaphor, imagery, dialogue, etc?\n6. Does the prompt expect a response in natural language (e.g., sentences, paragraphs) rather than visual, mathematical, or non-linguistic output?\n\nOutput your verdict as either "yes" or "no"in the following format:\n<decision>\n[yes/no]\n</decision>. Do NOT explain.'
+        self.prompt_template = "<user_prompt>\n{PROMPT}\n</user_prompt>"
+
+    def get_score(self, judgment):
+        matches = self.pattern.findall(
+            judgment.replace("\n", "")
+            .replace("[", "")
+            .replace("]", "")
+            .replace(" ", "")
+            .lower()
+        )
+        matches = [m for m in matches if m != ""]
+        if len(set(matches)) == 0:
+            return None
+        elif len(set(matches)) == 1:
+            return matches[0]
+        else:
+            return None
+
+    def pre_process(self, prompt):
+        args = {"PROMPT": prompt}
+        conv = [
+            {"role": "system", "content": self.system_prompt},
+            {"role": "user", "content": self.prompt_template.format(**args)},
+        ]
+        return conv
+
+    def post_process(self, judgment):
+        score = self.get_score(judgment=judgment)
+        bool_score = bool(score == "yes") if score else False
+        return {"creative_writing": bool_score, "score": score}
diff --git a/fastchat/serve/monitor/classify/config.yaml b/fastchat/serve/monitor/classify/config.yaml
@@ -10,6 +10,7 @@ task_name:
   - criteria_v0.1
   - if_v0.1
   - math_v0.1
+  - creative_writing_v0.1
 
 model_name: null
 name: llama-3-70b-instruct