From 2264580336da281ab47a2f35975d3f918161ca92 Mon Sep 17 00:00:00 2001
From: Trangle
Date: Fri, 1 Sep 2023 09:34:32 +0800
Subject: [PATCH 01/45] Remove hardcoded flash-attn disable setting (#2342)

---
 fastchat/model/model_adapter.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py
index afe79a6bf..8c2fbde32 100644
--- a/fastchat/model/model_adapter.py
+++ b/fastchat/model/model_adapter.py
@@ -1339,7 +1339,8 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict):
             model_path,
             trust_remote_code=True,
         )
-        config.use_flash_attn = False
+        # NOTE: if you use an old version of the model files, uncomment the line below
+        # config.use_flash_attn = False
         config.fp16 = True
         generation_config = GenerationConfig.from_pretrained(
             model_path, trust_remote_code=True

From 24a8755b2a8e2f65e43051b95d74e00396969d51 Mon Sep 17 00:00:00 2001
From: Nathan Stitt
Date: Thu, 31 Aug 2023 20:35:09 -0500
Subject: [PATCH 02/45] Document turning off proxy_buffering when the API is streaming (#2337)

---
 docs/openai_api.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/openai_api.md b/docs/openai_api.md
index f69cc4f00..0c555a60e 100644
--- a/docs/openai_api.md
+++ b/docs/openai_api.md
@@ -62,7 +62,7 @@ completion = openai.ChatCompletion.create(
 print(completion.choices[0].message.content)
 ```
 
-Streaming is also supported. See [test_openai_api.py](../tests/test_openai_api.py).
+Streaming is also supported. See [test_openai_api.py](../tests/test_openai_api.py). If your API server is behind a proxy, you'll need to turn off buffering; you can do so in Nginx by setting `proxy_buffering off;` in the `location` block for the proxy.
 
 ### cURL
 cURL is another good tool for observing the output of the API.
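PATCH 01 above stops force-disabling flash attention, so `config.use_flash_attn` is left at whatever the checkpoint's own config declares. Below is a minimal sketch of the resulting load path for a trust-remote-code checkpoint whose config exposes `use_flash_attn` and `fp16` (Qwen-style models do); the model name here is illustrative, not taken from the patch:

```python
from transformers import AutoConfig, AutoModelForCausalLM, GenerationConfig

model_path = "Qwen/Qwen-7B-Chat"  # illustrative checkpoint with this config layout

# Load the remote config as-is: flash attention now stays at whatever the
# checkpoint specifies instead of being hardcoded off by the adapter.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
# For old model files that still break with flash-attn, re-disable it manually:
# config.use_flash_attn = False
config.fp16 = True

model = AutoModelForCausalLM.from_pretrained(
    model_path, config=config, trust_remote_code=True
)
generation_config = GenerationConfig.from_pretrained(
    model_path, trust_remote_code=True
)
```

Newer checkpoints that ship with flash attention enabled keep it, while the commented-out line remains the escape hatch for older model files.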
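The doc change in PATCH 02 matters mainly for streamed responses: with Nginx's default `proxy_buffering on`, the server's incremental chunks are held back and delivered in one burst. Here is a small sketch of a streaming client against the FastChat OpenAI-compatible server from `docs/openai_api.md`, assuming it runs on localhost:8000 and serves `vicuna-7b-v1.5` as in the repo's tests:

```python
import openai  # openai<1.0 style, matching the ChatCompletion API used in the doc

openai.api_key = "EMPTY"  # FastChat's server does not check the key
openai.api_base = "http://localhost:8000/v1"

# With stream=True, tokens should appear incrementally; if a reverse proxy
# buffers the response, they all arrive at once -- hence `proxy_buffering off;`.
for chunk in openai.ChatCompletion.create(
    model="vicuna-7b-v1.5",
    messages=[{"role": "user", "content": "Hello! What is your name?"}],
    stream=True,
):
    delta = chunk["choices"][0]["delta"]
    print(delta.get("content", ""), end="", flush=True)
print()
```

When poking at the endpoint through the proxy with curl, `-N`/`--no-buffer` is also useful, since curl buffers its own output by default.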
From b039a66189f94f246f57e85f916a6667d1e3995d Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 4 Sep 2023 01:34:30 -0700 Subject: [PATCH 03/45] Simplify huggingface api example (#2355) --- fastchat/serve/huggingface_api.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fastchat/serve/huggingface_api.py b/fastchat/serve/huggingface_api.py index 47dcb87b1..7bf16a882 100644 --- a/fastchat/serve/huggingface_api.py +++ b/fastchat/serve/huggingface_api.py @@ -6,10 +6,8 @@ python3 -m fastchat.serve.huggingface_api --model lmsys/fastchat-t5-3b-v1.0 """ import argparse -import json import torch -from transformers import AutoTokenizer, AutoModelForCausalLM from fastchat.model import load_model, get_conversation_template, add_model_args @@ -34,8 +32,7 @@ def main(args): conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() - inputs = tokenizer([prompt]) - inputs = {k: torch.tensor(v).to(args.device) for k, v in inputs.items()} + inputs = tokenizer([prompt], return_tensors="pt").to(args.device) output_ids = model.generate( **inputs, do_sample=True if args.temperature > 1e-5 else False, From ea045e6de498c46c532ab5cea14bc3227cefbcfb Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 5 Sep 2023 11:40:58 -0700 Subject: [PATCH 04/45] Update sponsor logos (#2367) --- fastchat/model/model_registry.py | 2 +- fastchat/serve/gradio_block_arena_anony.py | 10 +++++----- fastchat/serve/gradio_web_server.py | 20 +++++++++++++++++++- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/fastchat/model/model_registry.py b/fastchat/model/model_registry.py index 96e2e768d..92938abc9 100644 --- a/fastchat/model/model_registry.py +++ b/fastchat/model/model_registry.py @@ -81,7 +81,7 @@ def get_model_info(name: str) -> ModelInfo: "a chat assistant fine-tuned from LLaMA on user-shared conversations by LMSYS", ) register_model_info( - ["wizardlm-13b"], + ["wizardlm-70b", "wizardlm-30b", "wizardlm-13b"], "WizardLM", "https://github.com/nlpxucan/WizardLM", "an instruction-following LLM using evol-instruct by Microsoft", diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index e20bdcd78..978f76b75 100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -174,17 +174,17 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re "vicuna-33b": 1.5, "vicuna-13b": 1.5, "mpt-30b-chat": 1.5, + "wizardlm-70b": 1.5, "wizardlm-13b": 1.5, # tier 2 "codellama-13b-instruct": 1.0, - "guanaco-33b": 1.0, "vicuna-7b": 1.0, "llama-2-7b-chat": 1.0, - # tier 3 + "chatglm2-6b": 1.0, + # deprecated + "guanaco-33b": 1.0, "fastchat-t5-3b": 0.5, "alpaca-13b": 0.5, - "chatglm2-6b": 0.5, - # deprecated "mpt-7b-chat": 0.1, "oasst-pythia-12b": 0.1, "RWKV-4-Raven-14B": 0.1, @@ -196,7 +196,7 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re "chatglm-6b": 0.5, } -SAMPLING_BOOST_MODELS = ["llama-2-70b-chat"] +SAMPLING_BOOST_MODELS = ["llama-2-70b-chat", "codellama-34b-instruct"] model_pairs = [] model_pairs_weights = [] diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index 29134dff4..c2e22e562 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -55,7 +55,13 @@ enable_moderation = False acknowledgment_md = """ -**Acknowledgment:** We thank Kaggle, MBZUAI, and AnyScale for their sponsorship. +
<div class="image-container">
+    <p> Acknowledgment: We thank Kaggle, MBZUAI, AnyScale, and HuggingFace for their sponsorship. </p>
+    <img alt="Image 1">
+    <img alt="Image 2">
+    <img alt="Image 3">
+    <img alt="Image 4">
+</div>
""" ip_expiration_dict = defaultdict(lambda: 0) @@ -497,6 +503,18 @@ def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request) footer { display:none !important } +.image-container { + display: flex; + align-items: center; + padding: 1px; +} +.image-container img { + margin: 0 30px; + height: 20px; + max-height: 100%; + width: auto; + max-width: 20%; +} """ From 85bec473b73c739ce5b499b810b11ed652bd59d2 Mon Sep 17 00:00:00 2001 From: leiwen83 Date: Wed, 6 Sep 2023 02:41:42 +0800 Subject: [PATCH 05/45] if LOGDIR is empty, then don't try output log to local file (#2357) Signed-off-by: Lei Wen Co-authored-by: Lei Wen --- fastchat/utils.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fastchat/utils.py b/fastchat/utils.py index 180cc35c8..25370eb17 100644 --- a/fastchat/utils.py +++ b/fastchat/utils.py @@ -57,18 +57,20 @@ def build_logger(logger_name, logger_filename): logger = logging.getLogger(logger_name) logger.setLevel(logging.INFO) - os.makedirs(LOGDIR, exist_ok=True) - filename = os.path.join(LOGDIR, logger_filename) - handler = logging.handlers.TimedRotatingFileHandler( - filename, when="D", utc=True, encoding="utf-8" - ) - handler.setFormatter(formatter) - - for l in [stdout_logger, stderr_logger, logger]: - if l in visited_loggers: - continue - visited_loggers.add(l) - l.addHandler(handler) + # if LOGDIR is empty, then don't try output log to local file + if LOGDIR != "": + os.makedirs(LOGDIR, exist_ok=True) + filename = os.path.join(LOGDIR, logger_filename) + handler = logging.handlers.TimedRotatingFileHandler( + filename, when="D", utc=True, encoding="utf-8" + ) + handler.setFormatter(formatter) + + for l in [stdout_logger, stderr_logger, logger]: + if l in visited_loggers: + continue + visited_loggers.add(l) + l.addHandler(handler) return logger From f99663cc565c9db1aab20e34ce7f719765a16519 Mon Sep 17 00:00:00 2001 From: leiwen83 Date: Wed, 6 Sep 2023 09:07:55 +0800 Subject: [PATCH 06/45] add best_of and use_beam_search for completions interface (#2348) Signed-off-by: Lei Wen Co-authored-by: Lei Wen --- fastchat/protocol/api_protocol.py | 2 +- fastchat/protocol/openai_api_protocol.py | 4 +- fastchat/serve/openai_api_server.py | 29 +++++++++- fastchat/serve/vllm_worker.py | 70 +++++++++++++++++------- 4 files changed, 79 insertions(+), 26 deletions(-) diff --git a/fastchat/protocol/api_protocol.py b/fastchat/protocol/api_protocol.py index 7dc8fe1c3..1091f5e5a 100644 --- a/fastchat/protocol/api_protocol.py +++ b/fastchat/protocol/api_protocol.py @@ -150,7 +150,7 @@ class CompletionResponse(BaseModel): created: int = Field(default_factory=lambda: int(time.time())) model: str choices: List[CompletionResponseChoice] - usage: UsageInfo + usage: Union[UsageInfo, List[UsageInfo]] class CompletionResponseStreamChoice(BaseModel): diff --git a/fastchat/protocol/openai_api_protocol.py b/fastchat/protocol/openai_api_protocol.py index 6232e8b9b..fc3c91ebd 100644 --- a/fastchat/protocol/openai_api_protocol.py +++ b/fastchat/protocol/openai_api_protocol.py @@ -151,11 +151,13 @@ class CompletionRequest(BaseModel): presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 user: Optional[str] = None + use_beam_search: Optional[bool] = False + best_of: Optional[int] = None class CompletionResponseChoice(BaseModel): index: int - text: str + text: Union[str, List[str]] logprobs: Optional[int] = None finish_reason: Optional[Literal["stop", "length"]] = None diff --git a/fastchat/serve/openai_api_server.py 
b/fastchat/serve/openai_api_server.py index 02e8481f4..1344ec46f 100644 --- a/fastchat/serve/openai_api_server.py +++ b/fastchat/serve/openai_api_server.py @@ -238,9 +238,12 @@ async def get_gen_params( *, temperature: float, top_p: float, + best_of: Optional[int], max_tokens: Optional[int], + n: Optional[int], echo: Optional[bool], stop: Optional[Union[str, List[str]]], + use_beam_search: Optional[bool], ) -> Dict[str, Any]: conv = await get_conv(model_name, worker_addr) conv = Conversation( @@ -287,6 +290,11 @@ async def get_gen_params( "stop_token_ids": conv.stop_token_ids, } + if best_of: + gen_params.update({"n": n, "best_of": best_of}) + if use_beam_search is not None: + gen_params.update({"use_beam_search": use_beam_search}) + new_stop = set() _add_to_set(stop, new_stop) _add_to_set(conv.stop_str, new_stop) @@ -491,15 +499,21 @@ async def create_completion(request: CompletionRequest): text, temperature=request.temperature, top_p=request.top_p, + best_of=request.best_of, max_tokens=request.max_tokens, + n=request.n, echo=request.echo, stop=request.stop, + use_beam_search=request.use_beam_search, ) for i in range(request.n): content = asyncio.create_task( generate_completion(gen_params, worker_addr) ) text_completions.append(content) + # when use with best_of, only need send one request + if request.best_of: + break try: all_tasks = await asyncio.gather(*text_completions) @@ -519,9 +533,18 @@ async def create_completion(request: CompletionRequest): finish_reason=content.get("finish_reason", "stop"), ) ) - task_usage = UsageInfo.parse_obj(content["usage"]) - for usage_key, usage_value in task_usage.dict().items(): - setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) + idx = 0 + while True: + info = content["usage"] + if isinstance(info, list): + info = info[idx] + + task_usage = UsageInfo.parse_obj(info) + + for usage_key, usage_value in task_usage.dict().items(): + setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) + idx += 1 + break return CompletionResponse( model=request.model, choices=choices, usage=UsageInfo.parse_obj(usage) diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py index 8e255b79c..71a30f890 100644 --- a/fastchat/serve/vllm_worker.py +++ b/fastchat/serve/vllm_worker.py @@ -18,6 +18,7 @@ from vllm.sampling_params import SamplingParams from vllm.utils import random_uuid +from fastchat.constants import ErrorCode, SERVER_ERROR_MSG from fastchat.serve.model_worker import ( BaseModelWorker, logger, @@ -74,6 +75,9 @@ async def generate_stream(self, params): if self.tokenizer.eos_token_id is not None: stop_token_ids.append(self.tokenizer.eos_token_id) echo = params.get("echo", True) + use_beam_search = params.get("use_beam_search", False) + best_of = params.get("best_of", None) + n = params.get("n", 1) # Handle stop_str stop = set() @@ -90,27 +94,51 @@ async def generate_stream(self, params): top_p = max(top_p, 1e-5) if temperature <= 1e-5: top_p = 1.0 - sampling_params = SamplingParams( - n=1, - temperature=temperature, - top_p=top_p, - use_beam_search=False, - stop=list(stop), - max_tokens=max_new_tokens, - ) - results_generator = engine.generate(context, sampling_params, request_id) - - async for request_output in results_generator: - prompt = request_output.prompt - if echo: - text_outputs = [ - prompt + output.text for output in request_output.outputs - ] - else: - text_outputs = [output.text for output in request_output.outputs] - text_outputs = " ".join(text_outputs) - # Note: usage is not supported yet - ret = 
{"text": text_outputs, "error_code": 0, "usage": {}} + try: + sampling_params = SamplingParams( + n=n, + temperature=temperature, + top_p=top_p, + use_beam_search=use_beam_search, + stop=list(stop), + max_tokens=max_new_tokens, + best_of=best_of, + ) + + results_generator = engine.generate(context, sampling_params, request_id) + + async for request_output in results_generator: + prompt = request_output.prompt + prompt_tokens = len(request_output.prompt_token_ids) + output_usage = [] + for out in request_output.outputs: + completion_tokens = len(out.token_ids) + total_tokens = prompt_tokens + completion_tokens + output_usage.append( + { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + } + ) + + if echo: + text_outputs = [ + prompt + output.text for output in request_output.outputs + ] + else: + text_outputs = [output.text for output in request_output.outputs] + + if sampling_params.best_of is None: + text_outputs = [" ".join(text_outputs)] + ret = {"text": text_outputs, "error_code": 0, "usage": output_usage} + yield (json.dumps(ret) + "\0").encode() + except (ValueError, RuntimeError) as e: + ret = { + "text": f"{e}", + "error_code": ErrorCode.PARAM_OUT_OF_RANGE, + "usage": {}, + } yield (json.dumps(ret) + "\0").encode() async def generate(self, params): From 3cf04c2591c2cbc251f2f8e02aa0e40eb380790a Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 5 Sep 2023 19:50:15 -0700 Subject: [PATCH 07/45] Extract upvote/downvote from log files (#2369) --- fastchat/serve/monitor/basic_stats.py | 2 +- fastchat/serve/monitor/clean_chat_data.py | 25 +++++++++++------ fastchat/serve/monitor/topic_clustering.py | 32 ++++++++++++++++++++-- 3 files changed, 48 insertions(+), 11 deletions(-) diff --git a/fastchat/serve/monitor/basic_stats.py b/fastchat/serve/monitor/basic_stats.py index b57e0913c..e1934bb07 100644 --- a/fastchat/serve/monitor/basic_stats.py +++ b/fastchat/serve/monitor/basic_stats.py @@ -17,7 +17,7 @@ def get_log_files(max_num_files=None): dates = [] - for month in range(4, 9): + for month in range(4, 12): for day in range(1, 33): dates.append(f"2023-{month:02d}-{day:02d}") diff --git a/fastchat/serve/monitor/clean_chat_data.py b/fastchat/serve/monitor/clean_chat_data.py index 76b4da50d..54e7b3e39 100644 --- a/fastchat/serve/monitor/clean_chat_data.py +++ b/fastchat/serve/monitor/clean_chat_data.py @@ -48,7 +48,7 @@ def get_log_files(max_num_files=None): return filenames -def clean_chat_data(log_files): +def clean_chat_data(log_files, action_type): raw_data = [] for filename in tqdm(log_files, desc="read files"): for retry in range(5): @@ -60,7 +60,7 @@ def clean_chat_data(log_files): for l in lines: row = json.loads(l) - if row["type"] == "chat": + if row["type"] == action_type: raw_data.append(row) all_models = set() @@ -70,18 +70,26 @@ def clean_chat_data(log_files): ct_invalid = 0 ct_network_error = 0 for row in raw_data: - if "conv_id" not in row["state"]: + try: + if action_type in ["chat", "upvote", "downvote"]: + state = row["state"] + model = row["model"] + elif action_type == "leftvote": + state = row["states"][0] + model = row["states"][0]["model_name"] + elif action_type == "rightvote": + state = row["states"][1] + model = row["states"][1]["model_name"] + conversation_id = state["conv_id"] + except KeyError: ct_invalid_conv_id += 1 continue - conversation_id = row["state"]["conv_id"] if conversation_id is None: ct_invalid_conv_id += 1 continue - state = row["state"] conversation = 
to_openai_format(state["messages"][state["offset"] :]) - model = row["model"] if not isinstance(model, str): ct_invalid += 1 continue @@ -150,17 +158,18 @@ def clean_chat_data(log_files): if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument("--action-type", type=str, default="chat") parser.add_argument("--max-num-files", type=int) args = parser.parse_args() log_files = get_log_files(args.max_num_files) - chats = clean_chat_data(log_files) + chats = clean_chat_data(log_files, args.action_type) last_updated_tstamp = chats[-1]["tstamp"] cutoff_date = datetime.datetime.fromtimestamp( last_updated_tstamp, tz=timezone("US/Pacific") ).strftime("%Y%m%d") - output = f"clean_chat_conv_{cutoff_date}.json" + output = f"clean_{args.action_type}_conv_{cutoff_date}.json" with open(output, "w") as fout: json.dump(chats, fout, indent=2, ensure_ascii=False) print(f"Write cleaned data to {output}") diff --git a/fastchat/serve/monitor/topic_clustering.py b/fastchat/serve/monitor/topic_clustering.py index fcc45b623..7710ce42a 100644 --- a/fastchat/serve/monitor/topic_clustering.py +++ b/fastchat/serve/monitor/topic_clustering.py @@ -124,7 +124,30 @@ def run_agg_cluster(embeddings, num_clusters): # Compute centers centers = [] - for i in range(clustering_model.n_clusters_): + for i in range(len(classes)): + centers.append(embeddings[new_labels == i].mean(axis=0, keepdim=True)) + centers = torch.cat(centers) + return centers, new_labels + + +def run_hdbscan_cluster(embeddings): + import hdbscan + + np.random.seed(0) + clusterer = hdbscan.HDBSCAN(min_cluster_size=10) + labels = torch.from_numpy(clusterer.fit_predict(embeddings)) + + # Sort labels + classes, counts = np.unique(labels, return_counts=True) + indices = np.argsort(counts)[::-1] + classes = [classes[i] for i in indices] + new_labels = torch.empty_like(labels) + for i, c in enumerate(classes): + new_labels[labels == c] = i + + # Compute centers + centers = [] + for i in range(len(classes)): centers.append(embeddings[new_labels == i].mean(axis=0, keepdim=True)) centers = torch.cat(centers) return centers, new_labels @@ -183,7 +206,10 @@ def get_cluster_info(texts, labels, topk_indices): parser.add_argument("--english-only", action="store_true") parser.add_argument("--num-clusters", type=int, default=20) parser.add_argument( - "--cluster-alg", type=str, choices=["kmeans", "aggcls"], default="kmeans" + "--cluster-alg", + type=str, + choices=["kmeans", "aggcls", "HDBSCAN"], + default="kmeans", ) parser.add_argument("--show-top-k", type=int, default=200) parser.add_argument("--show-cut-off", type=int, default=512) @@ -203,6 +229,8 @@ def get_cluster_info(texts, labels, topk_indices): centers, labels = run_k_means(embeddings, num_clusters) elif args.cluster_alg == "aggcls": centers, labels = run_agg_cluster(embeddings, num_clusters) + elif args.cluster_alg == "HDBSCAN": + centers, labels = run_hdbscan_cluster(embeddings) else: raise ValueError(f"Invalid clustering algorithm: {args.cluster_alg}") From 94f4dd68bb62229c0b4d27665feb22656d3b90f7 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 5 Sep 2023 19:57:00 -0700 Subject: [PATCH 08/45] Revert "add best_of and use_beam_search for completions interface" (#2370) --- fastchat/protocol/api_protocol.py | 2 +- fastchat/protocol/openai_api_protocol.py | 4 +- fastchat/serve/openai_api_server.py | 29 +--------- fastchat/serve/vllm_worker.py | 70 +++++++----------------- 4 files changed, 26 insertions(+), 79 deletions(-) diff --git a/fastchat/protocol/api_protocol.py 
b/fastchat/protocol/api_protocol.py index 1091f5e5a..7dc8fe1c3 100644 --- a/fastchat/protocol/api_protocol.py +++ b/fastchat/protocol/api_protocol.py @@ -150,7 +150,7 @@ class CompletionResponse(BaseModel): created: int = Field(default_factory=lambda: int(time.time())) model: str choices: List[CompletionResponseChoice] - usage: Union[UsageInfo, List[UsageInfo]] + usage: UsageInfo class CompletionResponseStreamChoice(BaseModel): diff --git a/fastchat/protocol/openai_api_protocol.py b/fastchat/protocol/openai_api_protocol.py index fc3c91ebd..6232e8b9b 100644 --- a/fastchat/protocol/openai_api_protocol.py +++ b/fastchat/protocol/openai_api_protocol.py @@ -151,13 +151,11 @@ class CompletionRequest(BaseModel): presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 user: Optional[str] = None - use_beam_search: Optional[bool] = False - best_of: Optional[int] = None class CompletionResponseChoice(BaseModel): index: int - text: Union[str, List[str]] + text: str logprobs: Optional[int] = None finish_reason: Optional[Literal["stop", "length"]] = None diff --git a/fastchat/serve/openai_api_server.py b/fastchat/serve/openai_api_server.py index 1344ec46f..02e8481f4 100644 --- a/fastchat/serve/openai_api_server.py +++ b/fastchat/serve/openai_api_server.py @@ -238,12 +238,9 @@ async def get_gen_params( *, temperature: float, top_p: float, - best_of: Optional[int], max_tokens: Optional[int], - n: Optional[int], echo: Optional[bool], stop: Optional[Union[str, List[str]]], - use_beam_search: Optional[bool], ) -> Dict[str, Any]: conv = await get_conv(model_name, worker_addr) conv = Conversation( @@ -290,11 +287,6 @@ async def get_gen_params( "stop_token_ids": conv.stop_token_ids, } - if best_of: - gen_params.update({"n": n, "best_of": best_of}) - if use_beam_search is not None: - gen_params.update({"use_beam_search": use_beam_search}) - new_stop = set() _add_to_set(stop, new_stop) _add_to_set(conv.stop_str, new_stop) @@ -499,21 +491,15 @@ async def create_completion(request: CompletionRequest): text, temperature=request.temperature, top_p=request.top_p, - best_of=request.best_of, max_tokens=request.max_tokens, - n=request.n, echo=request.echo, stop=request.stop, - use_beam_search=request.use_beam_search, ) for i in range(request.n): content = asyncio.create_task( generate_completion(gen_params, worker_addr) ) text_completions.append(content) - # when use with best_of, only need send one request - if request.best_of: - break try: all_tasks = await asyncio.gather(*text_completions) @@ -533,18 +519,9 @@ async def create_completion(request: CompletionRequest): finish_reason=content.get("finish_reason", "stop"), ) ) - idx = 0 - while True: - info = content["usage"] - if isinstance(info, list): - info = info[idx] - - task_usage = UsageInfo.parse_obj(info) - - for usage_key, usage_value in task_usage.dict().items(): - setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) - idx += 1 - break + task_usage = UsageInfo.parse_obj(content["usage"]) + for usage_key, usage_value in task_usage.dict().items(): + setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) return CompletionResponse( model=request.model, choices=choices, usage=UsageInfo.parse_obj(usage) diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py index 71a30f890..8e255b79c 100644 --- a/fastchat/serve/vllm_worker.py +++ b/fastchat/serve/vllm_worker.py @@ -18,7 +18,6 @@ from vllm.sampling_params import SamplingParams from vllm.utils import random_uuid -from fastchat.constants import 
ErrorCode, SERVER_ERROR_MSG from fastchat.serve.model_worker import ( BaseModelWorker, logger, @@ -75,9 +74,6 @@ async def generate_stream(self, params): if self.tokenizer.eos_token_id is not None: stop_token_ids.append(self.tokenizer.eos_token_id) echo = params.get("echo", True) - use_beam_search = params.get("use_beam_search", False) - best_of = params.get("best_of", None) - n = params.get("n", 1) # Handle stop_str stop = set() @@ -94,51 +90,27 @@ async def generate_stream(self, params): top_p = max(top_p, 1e-5) if temperature <= 1e-5: top_p = 1.0 - try: - sampling_params = SamplingParams( - n=n, - temperature=temperature, - top_p=top_p, - use_beam_search=use_beam_search, - stop=list(stop), - max_tokens=max_new_tokens, - best_of=best_of, - ) - - results_generator = engine.generate(context, sampling_params, request_id) - - async for request_output in results_generator: - prompt = request_output.prompt - prompt_tokens = len(request_output.prompt_token_ids) - output_usage = [] - for out in request_output.outputs: - completion_tokens = len(out.token_ids) - total_tokens = prompt_tokens + completion_tokens - output_usage.append( - { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": total_tokens, - } - ) - - if echo: - text_outputs = [ - prompt + output.text for output in request_output.outputs - ] - else: - text_outputs = [output.text for output in request_output.outputs] - - if sampling_params.best_of is None: - text_outputs = [" ".join(text_outputs)] - ret = {"text": text_outputs, "error_code": 0, "usage": output_usage} - yield (json.dumps(ret) + "\0").encode() - except (ValueError, RuntimeError) as e: - ret = { - "text": f"{e}", - "error_code": ErrorCode.PARAM_OUT_OF_RANGE, - "usage": {}, - } + sampling_params = SamplingParams( + n=1, + temperature=temperature, + top_p=top_p, + use_beam_search=False, + stop=list(stop), + max_tokens=max_new_tokens, + ) + results_generator = engine.generate(context, sampling_params, request_id) + + async for request_output in results_generator: + prompt = request_output.prompt + if echo: + text_outputs = [ + prompt + output.text for output in request_output.outputs + ] + else: + text_outputs = [output.text for output in request_output.outputs] + text_outputs = " ".join(text_outputs) + # Note: usage is not supported yet + ret = {"text": text_outputs, "error_code": 0, "usage": {}} yield (json.dumps(ret) + "\0").encode() async def generate(self, params): From dc3dd120c8a68ed9b25509aebe1a1190a2aab838 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 5 Sep 2023 20:02:11 -0700 Subject: [PATCH 09/45] Improve doc (#2371) --- docs/commands/test_process.md | 3 +++ tests/launch_openai_api_test_server.py | 2 +- tests/test_cli.py | 5 ++--- tests/test_openai_api.py | 10 +++++----- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/docs/commands/test_process.md b/docs/commands/test_process.md index 642ffaa02..804717556 100644 --- a/docs/commands/test_process.md +++ b/docs/commands/test_process.md @@ -1,3 +1,6 @@ +## Unit tests for FastChat +The scripts are under [FastChat/tests](../../tests). 
+ ### Test CLI Inference ``` diff --git a/tests/launch_openai_api_test_server.py b/tests/launch_openai_api_test_server.py index ae21869a2..a58570fd6 100644 --- a/tests/launch_openai_api_test_server.py +++ b/tests/launch_openai_api_test_server.py @@ -13,7 +13,7 @@ def launch_process(cmd): launch_process("python3 -m fastchat.serve.openai_api_server") models = [ - "lmsys/vicuna-7b-v1.3", + "lmsys/vicuna-7b-v1.5", "lmsys/fastchat-t5-3b-v1.0", "THUDM/chatglm-6b", "mosaicml/mpt-7b-chat", diff --git a/tests/test_cli.py b/tests/test_cli.py index 4b8dbcc19..dcefa4bbe 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -7,14 +7,13 @@ def test_single_gpu(): models = [ - "lmsys/vicuna-7b-v1.3", + "lmsys/vicuna-7b-v1.5", "lmsys/longchat-7b-16k", "lmsys/fastchat-t5-3b-v1.0", + "meta-llama/Llama-2-7b-chat-hf", "THUDM/chatglm-6b", "THUDM/chatglm2-6b", "mosaicml/mpt-7b-chat", - "project-baize/baize-v2-7b", - "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b", "tiiuae/falcon-7b-instruct", "~/model_weights/alpaca-7b", "~/model_weights/RWKV-4-Raven-7B-v11x-Eng99%-Other1%-20230429-ctx8192.pth", diff --git a/tests/test_openai_api.py b/tests/test_openai_api.py index 87e8af4ec..f291b90a3 100644 --- a/tests/test_openai_api.py +++ b/tests/test_openai_api.py @@ -59,7 +59,7 @@ def test_chat_completion_stream(model): print() -def test_openai_curl(model): +def test_openai_curl(): run_cmd("curl http://localhost:8000/v1/models") run_cmd( @@ -67,7 +67,7 @@ def test_openai_curl(model): curl http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "vicuna-7b-v1.3", + "model": "vicuna-7b-v1.5", "messages": [{"role": "user", "content": "Hello! What is your name?"}] }' """ @@ -78,7 +78,7 @@ def test_openai_curl(model): curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "vicuna-7b-v1.3", + "model": "vicuna-7b-v1.5", "prompt": "Once upon a time", "max_tokens": 41, "temperature": 0.5 @@ -91,7 +91,7 @@ def test_openai_curl(model): curl http://localhost:8000/v1/embeddings \ -H "Content-Type: application/json" \ -d '{ - "model": "vicuna-7b-v1.3", + "model": "vicuna-7b-v1.5", "input": "Hello world!" 
}' """ @@ -111,4 +111,4 @@ def test_openai_curl(model): test_chat_completion_stream(model) print("===== Test curl =====") - test_openai_curl("vicuna-7b-v1.3") + test_openai_curl() From a5e6abf6c305ba0aca11a7fd77247a64c68359df Mon Sep 17 00:00:00 2001 From: leiwen83 Date: Thu, 7 Sep 2023 10:50:39 +0800 Subject: [PATCH 10/45] add best_of and use_beam_search for completions interface (#2372) Signed-off-by: Lei Wen Co-authored-by: Lei Wen --- fastchat/protocol/api_protocol.py | 2 +- fastchat/protocol/openai_api_protocol.py | 4 +- fastchat/serve/openai_api_server.py | 29 +++++++++- fastchat/serve/vllm_worker.py | 70 +++++++++++++++++------- 4 files changed, 79 insertions(+), 26 deletions(-) diff --git a/fastchat/protocol/api_protocol.py b/fastchat/protocol/api_protocol.py index 7dc8fe1c3..1091f5e5a 100644 --- a/fastchat/protocol/api_protocol.py +++ b/fastchat/protocol/api_protocol.py @@ -150,7 +150,7 @@ class CompletionResponse(BaseModel): created: int = Field(default_factory=lambda: int(time.time())) model: str choices: List[CompletionResponseChoice] - usage: UsageInfo + usage: Union[UsageInfo, List[UsageInfo]] class CompletionResponseStreamChoice(BaseModel): diff --git a/fastchat/protocol/openai_api_protocol.py b/fastchat/protocol/openai_api_protocol.py index 6232e8b9b..fc3c91ebd 100644 --- a/fastchat/protocol/openai_api_protocol.py +++ b/fastchat/protocol/openai_api_protocol.py @@ -151,11 +151,13 @@ class CompletionRequest(BaseModel): presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 user: Optional[str] = None + use_beam_search: Optional[bool] = False + best_of: Optional[int] = None class CompletionResponseChoice(BaseModel): index: int - text: str + text: Union[str, List[str]] logprobs: Optional[int] = None finish_reason: Optional[Literal["stop", "length"]] = None diff --git a/fastchat/serve/openai_api_server.py b/fastchat/serve/openai_api_server.py index 02e8481f4..e399345d8 100644 --- a/fastchat/serve/openai_api_server.py +++ b/fastchat/serve/openai_api_server.py @@ -241,6 +241,9 @@ async def get_gen_params( max_tokens: Optional[int], echo: Optional[bool], stop: Optional[Union[str, List[str]]], + best_of: Optional[int] = None, + n: Optional[int] = 1, + use_beam_search: Optional[bool] = None, ) -> Dict[str, Any]: conv = await get_conv(model_name, worker_addr) conv = Conversation( @@ -287,6 +290,11 @@ async def get_gen_params( "stop_token_ids": conv.stop_token_ids, } + if best_of is not None: + gen_params.update({"n": n, "best_of": best_of}) + if use_beam_search is not None: + gen_params.update({"use_beam_search": use_beam_search}) + new_stop = set() _add_to_set(stop, new_stop) _add_to_set(conv.stop_str, new_stop) @@ -494,12 +502,18 @@ async def create_completion(request: CompletionRequest): max_tokens=request.max_tokens, echo=request.echo, stop=request.stop, + best_of=request.best_of, + n=request.n, + use_beam_search=request.use_beam_search, ) for i in range(request.n): content = asyncio.create_task( generate_completion(gen_params, worker_addr) ) text_completions.append(content) + # when use with best_of, only need send one request + if request.best_of: + break try: all_tasks = await asyncio.gather(*text_completions) @@ -519,9 +533,18 @@ async def create_completion(request: CompletionRequest): finish_reason=content.get("finish_reason", "stop"), ) ) - task_usage = UsageInfo.parse_obj(content["usage"]) - for usage_key, usage_value in task_usage.dict().items(): - setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) + idx = 0 + while True: + info = 
content["usage"] + if isinstance(info, list): + info = info[idx] + + task_usage = UsageInfo.parse_obj(info) + + for usage_key, usage_value in task_usage.dict().items(): + setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) + idx += 1 + break return CompletionResponse( model=request.model, choices=choices, usage=UsageInfo.parse_obj(usage) diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py index 8e255b79c..71a30f890 100644 --- a/fastchat/serve/vllm_worker.py +++ b/fastchat/serve/vllm_worker.py @@ -18,6 +18,7 @@ from vllm.sampling_params import SamplingParams from vllm.utils import random_uuid +from fastchat.constants import ErrorCode, SERVER_ERROR_MSG from fastchat.serve.model_worker import ( BaseModelWorker, logger, @@ -74,6 +75,9 @@ async def generate_stream(self, params): if self.tokenizer.eos_token_id is not None: stop_token_ids.append(self.tokenizer.eos_token_id) echo = params.get("echo", True) + use_beam_search = params.get("use_beam_search", False) + best_of = params.get("best_of", None) + n = params.get("n", 1) # Handle stop_str stop = set() @@ -90,27 +94,51 @@ async def generate_stream(self, params): top_p = max(top_p, 1e-5) if temperature <= 1e-5: top_p = 1.0 - sampling_params = SamplingParams( - n=1, - temperature=temperature, - top_p=top_p, - use_beam_search=False, - stop=list(stop), - max_tokens=max_new_tokens, - ) - results_generator = engine.generate(context, sampling_params, request_id) - - async for request_output in results_generator: - prompt = request_output.prompt - if echo: - text_outputs = [ - prompt + output.text for output in request_output.outputs - ] - else: - text_outputs = [output.text for output in request_output.outputs] - text_outputs = " ".join(text_outputs) - # Note: usage is not supported yet - ret = {"text": text_outputs, "error_code": 0, "usage": {}} + try: + sampling_params = SamplingParams( + n=n, + temperature=temperature, + top_p=top_p, + use_beam_search=use_beam_search, + stop=list(stop), + max_tokens=max_new_tokens, + best_of=best_of, + ) + + results_generator = engine.generate(context, sampling_params, request_id) + + async for request_output in results_generator: + prompt = request_output.prompt + prompt_tokens = len(request_output.prompt_token_ids) + output_usage = [] + for out in request_output.outputs: + completion_tokens = len(out.token_ids) + total_tokens = prompt_tokens + completion_tokens + output_usage.append( + { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + } + ) + + if echo: + text_outputs = [ + prompt + output.text for output in request_output.outputs + ] + else: + text_outputs = [output.text for output in request_output.outputs] + + if sampling_params.best_of is None: + text_outputs = [" ".join(text_outputs)] + ret = {"text": text_outputs, "error_code": 0, "usage": output_usage} + yield (json.dumps(ret) + "\0").encode() + except (ValueError, RuntimeError) as e: + ret = { + "text": f"{e}", + "error_code": ErrorCode.PARAM_OUT_OF_RANGE, + "usage": {}, + } yield (json.dumps(ret) + "\0").encode() async def generate(self, params): From 1d703b2f63a3a166437219df481f68da8c802de9 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 6 Sep 2023 20:54:31 -0700 Subject: [PATCH 11/45] update monkey patch for llama2 (#2379) --- fastchat/train/train_mem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastchat/train/train_mem.py b/fastchat/train/train_mem.py index e4b335284..9ce4913aa 100644 --- a/fastchat/train/train_mem.py 
+++ b/fastchat/train/train_mem.py @@ -1,7 +1,7 @@ # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. # Need to call this before importing transformers. -from fastchat.train.llama_flash_attn_monkey_patch import ( +from fastchat.train.llama2_flash_attn_monkey_patch import ( replace_llama_attn_with_flash_attn, ) From 56744d1d947ad7cc94763e911529756b17139505 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 6 Sep 2023 23:29:27 -0700 Subject: [PATCH 12/45] Make E5 adapter more restrict to reduce mismatch (#2381) --- fastchat/model/model_adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index 8c2fbde32..c1e2b2163 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -1405,7 +1405,7 @@ class E5Adapter(BaseModelAdapter): use_fast_tokenizer = False def match(self, model_path: str): - return "e5" in model_path.lower() + return "e5-" in model_path.lower() def load_model(self, model_path: str, from_pretrained_kwargs: dict): revision = from_pretrained_kwargs.get("revision", "main") From 6af0a7c8a618f87e05151dcf4939766c8aa59754 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 8 Sep 2023 15:21:18 -0700 Subject: [PATCH 13/45] Update UI and sponsers (#2387) --- fastchat/serve/gradio_block_arena_anony.py | 6 +++--- fastchat/serve/gradio_block_arena_named.py | 4 ++-- fastchat/serve/gradio_web_server.py | 4 ++-- fastchat/serve/monitor/monitor.py | 9 +++++++-- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index 978f76b75..a598a8c9a 100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -196,7 +196,7 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re "chatglm-6b": 0.5, } -SAMPLING_BOOST_MODELS = ["llama-2-70b-chat", "codellama-34b-instruct"] +SAMPLING_BOOST_MODELS = ["wizardlm-70b"] model_pairs = [] model_pairs_weights = [] @@ -420,12 +420,12 @@ def build_side_by_side_ui_anony(models): with gr.Column(scale=20): textbox = gr.Textbox( show_label=False, - placeholder="Enter text and press ENTER", + placeholder="Enter your prompt here and press ENTER", visible=False, container=False, ) with gr.Column(scale=1, min_width=50): - send_btn = gr.Button(value="Send", visible=False) + send_btn = gr.Button(value="Battle", visible=False, variant="primary") with gr.Row() as button_row2: regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py index b26172f3e..c031d28c2 100644 --- a/fastchat/serve/gradio_block_arena_named.py +++ b/fastchat/serve/gradio_block_arena_named.py @@ -352,12 +352,12 @@ def build_side_by_side_ui_named(models): with gr.Column(scale=20): textbox = gr.Textbox( show_label=False, - placeholder="Enter text and press ENTER", + placeholder="Enter your prompt here and press ENTER", visible=False, container=False, ) with gr.Column(scale=1, min_width=50): - send_btn = gr.Button(value="Send", visible=False) + send_btn = gr.Button(value="Battle", visible=False, variant="primary") with gr.Row() as button_row2: regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index c2e22e562..2fae670dc 100644 --- a/fastchat/serve/gradio_web_server.py +++ 
b/fastchat/serve/gradio_web_server.py @@ -591,12 +591,12 @@ def build_single_model_ui(models, add_promotion_links=False): with gr.Column(scale=20): textbox = gr.Textbox( show_label=False, - placeholder="Enter text and press ENTER", + placeholder="Enter your prompt here and press ENTER", visible=False, container=False, ) with gr.Column(scale=1, min_width=50): - send_btn = gr.Button(value="Send", visible=False) + send_btn = gr.Button(value="Send", visible=False, variant="primary") with gr.Row(visible=False) as button_row: upvote_btn = gr.Button(value="👍 Upvote", interactive=False) diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index 395f2bf84..b2081bc0d 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -30,11 +30,11 @@ def make_leaderboard_md(elo_results): | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://huggingface.co/datasets/lmsys/chatbot_arena_conversations) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | 🏆 This leaderboard is based on the following three benchmarks. -- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 50K+ user votes to compute Elo ratings. +- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 70K+ user votes to compute Elo ratings. - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses. - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks. -💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are computed by [InstructEval](https://github.com/declare-lab/instruct-eval) and [Chain-of-Thought Hub](https://github.com/FranxYao/chain-of-thought-hub). Higher values are better for all benchmarks. Empty cells mean not available. +💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are computed by [InstructEval](https://github.com/declare-lab/instruct-eval) and [Chain-of-Thought Hub](https://github.com/FranxYao/chain-of-thought-hub). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: Sept, 2023. 
""" return leaderboard_md @@ -241,6 +241,11 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file): "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)" ) plot_4 = gr.Plot(p4, show_label=False) + + from fastchat.serve.gradio_web_server import acknowledgment_md + + gr.Markdown(acknowledgment_md) + return [md_1, plot_1, plot_2, plot_3, plot_4] From 9b3147e885adc30552e150f530d9d47c9b919805 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sat, 9 Sep 2023 21:44:09 -0700 Subject: [PATCH 14/45] Use fsdp api for save save (#2390) --- fastchat/train/train.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fastchat/train/train.py b/fastchat/train/train.py index a2c461d78..89dff81dd 100644 --- a/fastchat/train/train.py +++ b/fastchat/train/train.py @@ -69,13 +69,15 @@ def rank0_print(*args): print(*args) -def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str): - """Collects the state dict and dump to disk.""" - state_dict = trainer.model.state_dict() - if trainer.args.should_save: - cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()} - del state_dict - trainer._save(output_dir, state_dict=cpu_state_dict) # noqa +def trainer_save_model_safe(trainer: transformers.Trainer): + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + from torch.distributed.fsdp import StateDictType, FullStateDictConfig + + save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) + with FSDP.state_dict_type( + trainer.model, StateDictType.FULL_STATE_DICT, save_policy + ): + trainer.save_model() def preprocess( @@ -279,9 +281,11 @@ def train(): trainer.train(resume_from_checkpoint=True) else: trainer.train() + + # Save model model.config.use_cache = True trainer.save_state() - safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir) + trainer_save_model_safe(trainer) if __name__ == "__main__": From a6167dbc3014f5e9d80c8d5bbf98757abfdaa0e7 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 10 Sep 2023 04:45:41 +0000 Subject: [PATCH 15/45] Release v0.2.27 --- fastchat/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fastchat/__init__.py b/fastchat/__init__.py index 3b9e925d1..be2d7c2ff 100644 --- a/fastchat/__init__.py +++ b/fastchat/__init__.py @@ -1 +1 @@ -__version__ = "0.2.26" +__version__ = "0.2.27" diff --git a/pyproject.toml b/pyproject.toml index 6c1d12f5e..73dbdd8da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "fschat" -version = "0.2.26" +version = "0.2.27" description = "An open platform for training, serving, and evaluating large language model based chatbots." readme = "README.md" requires-python = ">=3.8" From 7dcdafe936d2ff95863f894206dc56d4b86ff01d Mon Sep 17 00:00:00 2001 From: Jon Durbin Date: Mon, 11 Sep 2023 13:51:51 -0400 Subject: [PATCH 16/45] Spicyboros + airoboros 2.2 template update. 
(#2392) Co-authored-by: Jon Durbin --- fastchat/conversation.py | 11 +++++++++++ fastchat/model/model_adapter.py | 7 ++++++- fastchat/model/model_registry.py | 20 +++++++++++++++----- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/fastchat/conversation.py b/fastchat/conversation.py index f733be68a..73fb541f1 100644 --- a/fastchat/conversation.py +++ b/fastchat/conversation.py @@ -357,6 +357,17 @@ def get_conv_template(name: str) -> Conversation: ) ) +register_conv_template( + Conversation( + name="airoboros_v2", + system_message="A chat.", + roles=("USER", "ASSISTANT"), + sep_style=SeparatorStyle.ADD_COLON_TWO, + sep="\n", + sep2="", + ) +) + # Koala default template register_conv_template( Conversation( diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index c1e2b2163..f018c212e 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -2,6 +2,7 @@ import math import os +import re import sys from typing import Dict, List, Optional import warnings @@ -561,9 +562,13 @@ class AiroborosAdapter(BaseModelAdapter): """The model adapter for jondurbin/airoboros-*""" def match(self, model_path: str): - return "airoboros" in model_path.lower() + if re.search(r"airoboros|spicyboros", model_path, re.I): + return True + return False def get_default_conv_template(self, model_path: str) -> Conversation: + if "spicyboros" in model_path or re.search(r"-(2\.[2-9]+)", model_path): + return get_conv_template("airoboros_v2") return get_conv_template("airoboros_v1") def load_model(self, model_path: str, from_pretrained_kwargs: dict): diff --git a/fastchat/model/model_registry.py b/fastchat/model/model_registry.py index 92938abc9..aaf7e5e5f 100644 --- a/fastchat/model/model_registry.py +++ b/fastchat/model/model_registry.py @@ -214,15 +214,25 @@ def get_model_info(name: str) -> ModelInfo: ) register_model_info( [ - "airoboros-7b-gpt4-1.4", - "airoboros-13b-gpt4-1.4", - "airoboros-33b-gpt4-1.4", - "airoboros-65b-gpt4-1.4", + "airoboros-l2-7b-2.1", + "airoboros-l2-13b-2.1", + "airoboros-c34b-2.1", + "airoboros-l2-70b-2.1", ], "airoboros", - "https://huggingface.co/jondurbin/airoboros-33b-gpt4-1.4", + "https://huggingface.co/jondurbin/airoboros-l2-70b-2.1", "an instruction-tuned LlaMa model tuned with 100% synthetic instruction-response pairs from GPT4", ) +register_model_info( + [ + "spicyboros-7b-2.2", + "spicyboros-13b-2.2", + "spicyboros-70b-2.2", + ], + "spicyboros", + "https://huggingface.co/jondurbin/spicyboros-70b-2.2", + "de-aligned versions of the airoboros models", +) register_model_info( ["Robin-7b-v2", "Robin-13b-v2", "Robin-33b-v2"], "Robin-v2", From b921f1616f1153d9706eaf989ea35fd11eeae518 Mon Sep 17 00:00:00 2001 From: Rayrtfr <2384172887@qq.com> Date: Tue, 12 Sep 2023 01:56:34 +0800 Subject: [PATCH 17/45] bugfix of openai_api_server for fastchat.serve.vllm_worker (#2398) Co-authored-by: wuyongyu --- fastchat/serve/openai_api_server.py | 54 +++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/fastchat/serve/openai_api_server.py b/fastchat/serve/openai_api_server.py index e399345d8..5ffcc0448 100644 --- a/fastchat/serve/openai_api_server.py +++ b/fastchat/serve/openai_api_server.py @@ -399,15 +399,28 @@ async def create_chat_completion(request: ChatCompletionRequest): for i, content in enumerate(all_tasks): if content["error_code"] != 0: return create_error_response(content["error_code"], content["text"]) - choices.append( - ChatCompletionResponseChoice( - index=i, - 
message=ChatMessage(role="assistant", content=content["text"]), - finish_reason=content.get("finish_reason", "stop"), + if isinstance(content["text"], list): + for t in content["text"]: + choices.append( + ChatCompletionResponseChoice( + index=i, + message=ChatMessage(role="assistant", content=t), + finish_reason=content.get("finish_reason", "stop"), + ) + ) + else: + choices.append( + ChatCompletionResponseChoice( + index=i, + message=ChatMessage(role="assistant", content=content["text"]), + finish_reason=content.get("finish_reason", "stop"), + ) ) - ) if "usage" in content: - task_usage = UsageInfo.parse_obj(content["usage"]) + if isinstance(content["usage"], list): + task_usage = UsageInfo.parse_obj(content["usage"][0]) + else: + task_usage = UsageInfo.parse_obj(content["usage"]) for usage_key, usage_value in task_usage.dict().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) @@ -775,14 +788,27 @@ async def create_chat_completion(request: APIChatCompletionRequest): for i, content in enumerate(all_tasks): if content["error_code"] != 0: return create_error_response(content["error_code"], content["text"]) - choices.append( - ChatCompletionResponseChoice( - index=i, - message=ChatMessage(role="assistant", content=content["text"]), - finish_reason=content.get("finish_reason", "stop"), + if isinstance(content["text"], list): + for t in content["text"]: + choices.append( + ChatCompletionResponseChoice( + index=i, + message=ChatMessage(role="assistant", content=t), + finish_reason=content.get("finish_reason", "stop"), + ) + ) + else: + choices.append( + ChatCompletionResponseChoice( + index=i, + message=ChatMessage(role="assistant", content=content["text"]), + finish_reason=content.get("finish_reason", "stop"), + ) ) - ) - task_usage = UsageInfo.parse_obj(content["usage"]) + if isinstance(content["usage"], list): + task_usage = UsageInfo.parse_obj(content["usage"][0]) + else: + task_usage = UsageInfo.parse_obj(content["usage"]) for usage_key, usage_value in task_usage.dict().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) From 13f40b39bb4abef7984bf1314a7eb087150f1183 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 11 Sep 2023 16:11:32 -0700 Subject: [PATCH 18/45] Revert "bugfix of openai_api_server for fastchat.serve.vllm_worker" (#2400) --- fastchat/serve/openai_api_server.py | 54 ++++++++--------------------- 1 file changed, 14 insertions(+), 40 deletions(-) diff --git a/fastchat/serve/openai_api_server.py b/fastchat/serve/openai_api_server.py index 5ffcc0448..e399345d8 100644 --- a/fastchat/serve/openai_api_server.py +++ b/fastchat/serve/openai_api_server.py @@ -399,28 +399,15 @@ async def create_chat_completion(request: ChatCompletionRequest): for i, content in enumerate(all_tasks): if content["error_code"] != 0: return create_error_response(content["error_code"], content["text"]) - if isinstance(content["text"], list): - for t in content["text"]: - choices.append( - ChatCompletionResponseChoice( - index=i, - message=ChatMessage(role="assistant", content=t), - finish_reason=content.get("finish_reason", "stop"), - ) - ) - else: - choices.append( - ChatCompletionResponseChoice( - index=i, - message=ChatMessage(role="assistant", content=content["text"]), - finish_reason=content.get("finish_reason", "stop"), - ) + choices.append( + ChatCompletionResponseChoice( + index=i, + message=ChatMessage(role="assistant", content=content["text"]), + finish_reason=content.get("finish_reason", "stop"), ) + ) if "usage" in content: - if 
isinstance(content["usage"], list): - task_usage = UsageInfo.parse_obj(content["usage"][0]) - else: - task_usage = UsageInfo.parse_obj(content["usage"]) + task_usage = UsageInfo.parse_obj(content["usage"]) for usage_key, usage_value in task_usage.dict().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) @@ -788,27 +775,14 @@ async def create_chat_completion(request: APIChatCompletionRequest): for i, content in enumerate(all_tasks): if content["error_code"] != 0: return create_error_response(content["error_code"], content["text"]) - if isinstance(content["text"], list): - for t in content["text"]: - choices.append( - ChatCompletionResponseChoice( - index=i, - message=ChatMessage(role="assistant", content=t), - finish_reason=content.get("finish_reason", "stop"), - ) - ) - else: - choices.append( - ChatCompletionResponseChoice( - index=i, - message=ChatMessage(role="assistant", content=content["text"]), - finish_reason=content.get("finish_reason", "stop"), - ) + choices.append( + ChatCompletionResponseChoice( + index=i, + message=ChatMessage(role="assistant", content=content["text"]), + finish_reason=content.get("finish_reason", "stop"), ) - if isinstance(content["usage"], list): - task_usage = UsageInfo.parse_obj(content["usage"][0]) - else: - task_usage = UsageInfo.parse_obj(content["usage"]) + ) + task_usage = UsageInfo.parse_obj(content["usage"]) for usage_key, usage_value in task_usage.dict().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) From 77aa4df4a1204e1b0b863e1c03f0ef6377d6c476 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 11 Sep 2023 16:11:51 -0700 Subject: [PATCH 19/45] Revert "add best_of and use_beam_search for completions interface" (#2401) --- fastchat/protocol/api_protocol.py | 2 +- fastchat/protocol/openai_api_protocol.py | 4 +- fastchat/serve/openai_api_server.py | 29 +--------- fastchat/serve/vllm_worker.py | 70 +++++++----------------- 4 files changed, 26 insertions(+), 79 deletions(-) diff --git a/fastchat/protocol/api_protocol.py b/fastchat/protocol/api_protocol.py index 1091f5e5a..7dc8fe1c3 100644 --- a/fastchat/protocol/api_protocol.py +++ b/fastchat/protocol/api_protocol.py @@ -150,7 +150,7 @@ class CompletionResponse(BaseModel): created: int = Field(default_factory=lambda: int(time.time())) model: str choices: List[CompletionResponseChoice] - usage: Union[UsageInfo, List[UsageInfo]] + usage: UsageInfo class CompletionResponseStreamChoice(BaseModel): diff --git a/fastchat/protocol/openai_api_protocol.py b/fastchat/protocol/openai_api_protocol.py index fc3c91ebd..6232e8b9b 100644 --- a/fastchat/protocol/openai_api_protocol.py +++ b/fastchat/protocol/openai_api_protocol.py @@ -151,13 +151,11 @@ class CompletionRequest(BaseModel): presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 user: Optional[str] = None - use_beam_search: Optional[bool] = False - best_of: Optional[int] = None class CompletionResponseChoice(BaseModel): index: int - text: Union[str, List[str]] + text: str logprobs: Optional[int] = None finish_reason: Optional[Literal["stop", "length"]] = None diff --git a/fastchat/serve/openai_api_server.py b/fastchat/serve/openai_api_server.py index e399345d8..02e8481f4 100644 --- a/fastchat/serve/openai_api_server.py +++ b/fastchat/serve/openai_api_server.py @@ -241,9 +241,6 @@ async def get_gen_params( max_tokens: Optional[int], echo: Optional[bool], stop: Optional[Union[str, List[str]]], - best_of: Optional[int] = None, - n: Optional[int] = 1, - use_beam_search: 
Optional[bool] = None, ) -> Dict[str, Any]: conv = await get_conv(model_name, worker_addr) conv = Conversation( @@ -290,11 +287,6 @@ async def get_gen_params( "stop_token_ids": conv.stop_token_ids, } - if best_of is not None: - gen_params.update({"n": n, "best_of": best_of}) - if use_beam_search is not None: - gen_params.update({"use_beam_search": use_beam_search}) - new_stop = set() _add_to_set(stop, new_stop) _add_to_set(conv.stop_str, new_stop) @@ -502,18 +494,12 @@ async def create_completion(request: CompletionRequest): max_tokens=request.max_tokens, echo=request.echo, stop=request.stop, - best_of=request.best_of, - n=request.n, - use_beam_search=request.use_beam_search, ) for i in range(request.n): content = asyncio.create_task( generate_completion(gen_params, worker_addr) ) text_completions.append(content) - # when use with best_of, only need send one request - if request.best_of: - break try: all_tasks = await asyncio.gather(*text_completions) @@ -533,18 +519,9 @@ async def create_completion(request: CompletionRequest): finish_reason=content.get("finish_reason", "stop"), ) ) - idx = 0 - while True: - info = content["usage"] - if isinstance(info, list): - info = info[idx] - - task_usage = UsageInfo.parse_obj(info) - - for usage_key, usage_value in task_usage.dict().items(): - setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) - idx += 1 - break + task_usage = UsageInfo.parse_obj(content["usage"]) + for usage_key, usage_value in task_usage.dict().items(): + setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) return CompletionResponse( model=request.model, choices=choices, usage=UsageInfo.parse_obj(usage) diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py index 71a30f890..8e255b79c 100644 --- a/fastchat/serve/vllm_worker.py +++ b/fastchat/serve/vllm_worker.py @@ -18,7 +18,6 @@ from vllm.sampling_params import SamplingParams from vllm.utils import random_uuid -from fastchat.constants import ErrorCode, SERVER_ERROR_MSG from fastchat.serve.model_worker import ( BaseModelWorker, logger, @@ -75,9 +74,6 @@ async def generate_stream(self, params): if self.tokenizer.eos_token_id is not None: stop_token_ids.append(self.tokenizer.eos_token_id) echo = params.get("echo", True) - use_beam_search = params.get("use_beam_search", False) - best_of = params.get("best_of", None) - n = params.get("n", 1) # Handle stop_str stop = set() @@ -94,51 +90,27 @@ async def generate_stream(self, params): top_p = max(top_p, 1e-5) if temperature <= 1e-5: top_p = 1.0 - try: - sampling_params = SamplingParams( - n=n, - temperature=temperature, - top_p=top_p, - use_beam_search=use_beam_search, - stop=list(stop), - max_tokens=max_new_tokens, - best_of=best_of, - ) - - results_generator = engine.generate(context, sampling_params, request_id) - - async for request_output in results_generator: - prompt = request_output.prompt - prompt_tokens = len(request_output.prompt_token_ids) - output_usage = [] - for out in request_output.outputs: - completion_tokens = len(out.token_ids) - total_tokens = prompt_tokens + completion_tokens - output_usage.append( - { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": total_tokens, - } - ) - - if echo: - text_outputs = [ - prompt + output.text for output in request_output.outputs - ] - else: - text_outputs = [output.text for output in request_output.outputs] - - if sampling_params.best_of is None: - text_outputs = [" ".join(text_outputs)] - ret = {"text": text_outputs, "error_code": 0, 
"usage": output_usage} - yield (json.dumps(ret) + "\0").encode() - except (ValueError, RuntimeError) as e: - ret = { - "text": f"{e}", - "error_code": ErrorCode.PARAM_OUT_OF_RANGE, - "usage": {}, - } + sampling_params = SamplingParams( + n=1, + temperature=temperature, + top_p=top_p, + use_beam_search=False, + stop=list(stop), + max_tokens=max_new_tokens, + ) + results_generator = engine.generate(context, sampling_params, request_id) + + async for request_output in results_generator: + prompt = request_output.prompt + if echo: + text_outputs = [ + prompt + output.text for output in request_output.outputs + ] + else: + text_outputs = [output.text for output in request_output.outputs] + text_outputs = " ".join(text_outputs) + # Note: usage is not supported yet + ret = {"text": text_outputs, "error_code": 0, "usage": {}} yield (json.dumps(ret) + "\0").encode() async def generate(self, params): From 11b05bb5b3556218d7712e2f4a20da222ecc6c4b Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 11 Sep 2023 23:32:07 +0000 Subject: [PATCH 20/45] Release a v0.2.28 with bug fixes and more test cases --- fastchat/__init__.py | 2 +- pyproject.toml | 2 +- tests/launch_openai_api_test_server.py | 22 ++++++++++++++-------- tests/test_openai_api.py | 5 ++++- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/fastchat/__init__.py b/fastchat/__init__.py index be2d7c2ff..968391a2d 100644 --- a/fastchat/__init__.py +++ b/fastchat/__init__.py @@ -1 +1 @@ -__version__ = "0.2.27" +__version__ = "0.2.28" diff --git a/pyproject.toml b/pyproject.toml index 73dbdd8da..1b30b8881 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "fschat" -version = "0.2.27" +version = "0.2.28" description = "An open platform for training, serving, and evaluating large language model based chatbots." 
readme = "README.md" requires-python = ">=3.8" diff --git a/tests/launch_openai_api_test_server.py b/tests/launch_openai_api_test_server.py index a58570fd6..f555a3882 100644 --- a/tests/launch_openai_api_test_server.py +++ b/tests/launch_openai_api_test_server.py @@ -13,17 +13,23 @@ def launch_process(cmd): launch_process("python3 -m fastchat.serve.openai_api_server") models = [ - "lmsys/vicuna-7b-v1.5", - "lmsys/fastchat-t5-3b-v1.0", - "THUDM/chatglm-6b", - "mosaicml/mpt-7b-chat", + ("lmsys/vicuna-7b-v1.5", "model_worker"), + ("lmsys/fastchat-t5-3b-v1.0", "model_worker"), + ("THUDM/chatglm-6b", "model_worker"), + ("mosaicml/mpt-7b-chat", "model_worker"), + ("meta-llama/Llama-2-7b-chat-hf", "vllm_worker"), ] - for i, model_path in enumerate(models): - launch_process( - f"CUDA_VISIBLE_DEVICES={i} python3 -m fastchat.serve.model_worker " - f"--model-path {model_path} --port {30000+i} --worker http://localhost:{30000+i}" + for i, (model_path, worker_name) in enumerate(models): + cmd = ( + f"CUDA_VISIBLE_DEVICES={i} python3 -m fastchat.serve.{worker_name} " + f"--model-path {model_path} --port {30000+i} " + f"--worker-address http://localhost:{30000+i} " ) + if worker_name == "vllm_worker": + cmd += "--tokenizer hf-internal-testing/llama-tokenizer" + + launch_process(cmd) while True: pass diff --git a/tests/test_openai_api.py b/tests/test_openai_api.py index f291b90a3..d79af8322 100644 --- a/tests/test_openai_api.py +++ b/tests/test_openai_api.py @@ -106,9 +106,12 @@ def test_openai_curl(): print(f"===== Test {model} ======") test_completion(model) test_completion_stream(model) - test_embedding(model) test_chat_completion(model) test_chat_completion_stream(model) + try: + test_embedding(model) + except openai.error.APIError as e: + print(f"Embedding error: {e}") print("===== Test curl =====") test_openai_curl() From a8088ba88f3b0611afe47559431375d18320b448 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Tue, 12 Sep 2023 12:02:59 +0800 Subject: [PATCH 21/45] Fix model_worker error (#2404) --- fastchat/serve/inference.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fastchat/serve/inference.py b/fastchat/serve/inference.py index c97fd1c01..4e5191610 100644 --- a/fastchat/serve/inference.py +++ b/fastchat/serve/inference.py @@ -108,6 +108,7 @@ def generate_stream( past_key_values = out = None sent_interrupt = False + finish_reason = None for i in range(max_new_tokens): if i == 0: # prefill if model.config.is_encoder_decoder: @@ -240,12 +241,11 @@ def generate_stream( break # Finish stream event, which contains finish reason - if i == max_new_tokens - 1: + else: finish_reason = "length" - elif stopped: + + if stopped: finish_reason = "stop" - else: - finish_reason = None yield { "text": output, From b49d789417eba974a6cfd3855f4293bfeeeeb49f Mon Sep 17 00:00:00 2001 From: "Jeff (Zhen) Wang" Date: Tue, 12 Sep 2023 14:04:46 +1000 Subject: [PATCH 22/45] Added google/flan models and fixed AutoModelForSeq2SeqLM when loading T5 compression model (#2402) --- fastchat/model/compression.py | 16 ++++++++++++++-- fastchat/model/model_adapter.py | 8 ++++++++ pyproject.toml | 2 +- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/fastchat/model/compression.py b/fastchat/model/compression.py index 4a1d2adb7..c928db154 100644 --- a/fastchat/model/compression.py +++ b/fastchat/model/compression.py @@ -11,7 +11,13 @@ from torch.nn import functional as F import torch.nn as nn from tqdm import tqdm -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, 
AutoModel +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + AutoModel, + AutoModelForSeq2SeqLM, +) @dataclasses.dataclass @@ -123,7 +129,13 @@ def load_compress_model(model_path, device, torch_dtype, use_fast, revision="mai # some models are loaded by AutoModel but not AutoModelForCausalLM, # such as chatglm, chatglm2 try: - model = AutoModelForCausalLM.from_config(config, trust_remote_code=True) + # google/flan-* models are based on an AutoModelForSeq2SeqLM. + if "T5Config" in str(type(config)): + model = AutoModelForSeq2SeqLM.from_config( + config, trust_remote_code=True + ) + else: + model = AutoModelForCausalLM.from_config(config, trust_remote_code=True) except NameError: model = AutoModel.from_config(config, trust_remote_code=True) linear_weights = get_compressed_list(model) diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index f018c212e..423308455 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -649,6 +649,13 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): return model, tokenizer +class FlanAdapter(T5Adapter): + """The model adapter for flan-t5-*, flan-ul2""" + + def match(self, model_path: str): + return "flan" in model_path.lower() + + class KoalaAdapter(BaseModelAdapter): """The model adapter for koala""" @@ -1592,6 +1599,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(LongChatAdapter) register_model_adapter(CodeT5pAdapter) register_model_adapter(T5Adapter) +register_model_adapter(FlanAdapter) register_model_adapter(KoalaAdapter) register_model_adapter(AlpacaAdapter) register_model_adapter(ChatGLMAdapter) diff --git a/pyproject.toml b/pyproject.toml index 1b30b8881..c3ce59364 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ dependencies = [ ] [project.optional-dependencies] -model_worker = ["accelerate>=0.21", "peft", "sentencepiece", "torch", "transformers>=4.31.0"] +model_worker = ["accelerate>=0.21", "peft", "sentencepiece", "torch", "transformers>=4.31.0", "protobuf"] webui = ["gradio"] train = ["einops", "flash-attn>=2.0", "wandb"] llm_judge = ["openai", "anthropic>=0.3", "ray"] From 7dfcf1ab74e47dfc8eb81cfbcf50dc24dd3cd36d Mon Sep 17 00:00:00 2001 From: karshPrime <94996251+karshPrime@users.noreply.github.com> Date: Tue, 12 Sep 2023 18:24:30 +1000 Subject: [PATCH 23/45] Rename twitter to X (#2406) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7e8d7381e..75a0f3d80 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # FastChat -| [**Demo**](https://chat.lmsys.org/) | [**Discord**](https://discord.gg/HSWAKCrnFx) | [**Twitter**](https://twitter.com/lmsysorg) | +| [**Demo**](https://chat.lmsys.org/) | [**Discord**](https://discord.gg/HSWAKCrnFx) | [**X**](https://x.com/lmsysorg) | FastChat is an open platform for training, serving, and evaluating large language model based chatbots. The core features include: - The weights, training code, and evaluation code for state-of-the-art models (e.g., Vicuna). 
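A short illustration of what PATCH 22 changes in `load_compress_model`: the loader now inspects the config class and picks `AutoModelForSeq2SeqLM` for T5-style (encoder-decoder) checkpoints such as google/flan-*, falling back to `AutoModelForCausalLM` otherwise. The sketch below is illustrative rather than patch code, and the checkpoint name is only an example:

```python
# Sketch of the config-based dispatch added in PATCH 22 (illustrative only).
from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForSeq2SeqLM

config = AutoConfig.from_pretrained("google/flan-t5-small")  # example checkpoint
if "T5Config" in str(type(config)):
    # google/flan-* models are encoder-decoder, so they need the seq2seq class.
    model = AutoModelForSeq2SeqLM.from_config(config, trust_remote_code=True)
else:
    # Decoder-only models keep using the causal-LM class.
    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
```

Note that `from_config` creates randomly initialized weights; `load_compress_model` relies on this and streams the real, compressed weights in afterwards.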
From aa153d53d1deeec1586ff836cb3caab05c4080ad Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Tue, 12 Sep 2023 10:10:49 -0700
Subject: [PATCH 24/45] Update huggingface_api.py (#2409)

---
 fastchat/serve/huggingface_api.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fastchat/serve/huggingface_api.py b/fastchat/serve/huggingface_api.py
index 7bf16a882..5a4c30fec 100644
--- a/fastchat/serve/huggingface_api.py
+++ b/fastchat/serve/huggingface_api.py
@@ -14,6 +14,7 @@
 @torch.inference_mode()
 def main(args):
+    # Load model
     model, tokenizer = load_model(
         args.model_path,
         device=args.device,
@@ -25,13 +26,14 @@ def main(args):
         debug=args.debug,
     )

+    # Build the prompt with a conversation template
     msg = args.message
-
     conv = get_conversation_template(args.model_path)
     conv.append_message(conv.roles[0], msg)
     conv.append_message(conv.roles[1], None)
     prompt = conv.get_prompt()

+    # Run inference
     inputs = tokenizer([prompt], return_tensors="pt").to(args.device)
     output_ids = model.generate(
         **inputs,
@@ -49,6 +51,7 @@ def main(args):
         output_ids, skip_special_tokens=True, spaces_between_special_tokens=False
     )

+    # Print results
     print(f"{conv.roles[0]}: {msg}")
     print(f"{conv.roles[1]}: {outputs}")

From 3149253988ee16b0945aa0a381a42a07b8a7829e Mon Sep 17 00:00:00 2001
From: obitolyz
Date: Wed, 13 Sep 2023 13:07:34 +0800
Subject: [PATCH 25/45] Add support for baichuan2 models (#2408)

---
 fastchat/conversation.py        | 14 ++++++++++++++
 fastchat/model/model_adapter.py |  2 ++
 2 files changed, 16 insertions(+)

diff --git a/fastchat/conversation.py b/fastchat/conversation.py
index 73fb541f1..fcf882c5c 100644
--- a/fastchat/conversation.py
+++ b/fastchat/conversation.py
@@ -804,6 +804,20 @@ def get_conv_template(name: str) -> Conversation:
     )
 )

+# Baichuan2-13B-Chat template
+register_conv_template(
+    # source: https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/c6f8592a60b4ad73c210b28dd2ab3cca51abbf93/modeling_baichuan.py#L773
+    # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_config.json
+    # https://github.com/baichuan-inc/Baichuan2/issues/62
+    Conversation(
+        name="baichuan2-chat",
+        roles=("<reserved_106>", "<reserved_107>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="",
+        stop_token_ids=[],
+    )
+)
+
 # llama2 template
 # reference: https://huggingface.co/blog/codellama#conversational-instructions
 # reference: https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/generation.py#L212
diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py
index 423308455..296b53c8f 100644
--- a/fastchat/model/model_adapter.py
+++ b/fastchat/model/model_adapter.py
@@ -1172,6 +1172,8 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict):
     def get_default_conv_template(self, model_path: str) -> Conversation:
         # for Baichuan-13B-Chat
         if "chat" in model_path.lower():
+            if "baichuan2" in model_path.lower():
+                return get_conv_template("baichuan2-chat")
             return get_conv_template("baichuan-chat")
         return get_conv_template("zero_shot")

From 2e0e60b4db2613a47632edcbff458ebee7fa85ee Mon Sep 17 00:00:00 2001
From: Shangwei Chen <109785802+Somezak1@users.noreply.github.com>
Date: Mon, 18 Sep 2023 09:17:52 +0800
Subject: [PATCH 26/45] Fixed character overlap issue when api streaming output (#2431)

---
 fastchat/serve/openai_api_server.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fastchat/serve/openai_api_server.py b/fastchat/serve/openai_api_server.py
index 02e8481f4..7b596de64 100644
---
a/fastchat/serve/openai_api_server.py +++ b/fastchat/serve/openai_api_server.py @@ -435,7 +435,7 @@ async def chat_completion_stream_generator( return decoded_unicode = content["text"].replace("\ufffd", "") delta_text = decoded_unicode[len(previous_text) :] - previous_text = decoded_unicode + previous_text = decoded_unicode if len(decoded_unicode) > len(previous_text) else previous_text if len(delta_text) == 0: delta_text = None @@ -554,7 +554,7 @@ async def generate_completion_stream_generator( return decoded_unicode = content["text"].replace("\ufffd", "") delta_text = decoded_unicode[len(previous_text) :] - previous_text = decoded_unicode + previous_text = decoded_unicode if len(decoded_unicode) > len(previous_text) else previous_text # todo: index is not apparent choice_data = CompletionResponseStreamChoice( index=i, From c7e3e67b95b5ee6979d8e2b5623622f524d5bb33 Mon Sep 17 00:00:00 2001 From: HyungJin Ahn Date: Mon, 18 Sep 2023 10:18:38 +0900 Subject: [PATCH 27/45] Support custom conversation template in multi_model_worker (#2434) --- fastchat/serve/multi_model_worker.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fastchat/serve/multi_model_worker.py b/fastchat/serve/multi_model_worker.py index 098c6d11e..13872bbdd 100644 --- a/fastchat/serve/multi_model_worker.py +++ b/fastchat/serve/multi_model_worker.py @@ -178,6 +178,13 @@ def create_multi_model_worker(): action="append", help="One or more model names. Values must be aligned with `--model-path` values.", ) + parser.add_argument( + "--conv-template", + type=str, + default=None, + action="append", + help="Conversation prompt template. Values must be aligned with `--model-path` values. If only one value is provided, it will be repeated for all models.", + ) parser.add_argument("--limit-worker-concurrency", type=int, default=5) parser.add_argument("--stream-interval", type=int, default=2) parser.add_argument("--no-register", action="store_true") @@ -201,9 +208,16 @@ def create_multi_model_worker(): if args.model_names is None: args.model_names = [[x.split("/")[-1]] for x in args.model_path] + if args.conv_template is None: + args.conv_template = [None] * len(args.model_path) + elif len(args.conv_template) == 1: # Repeat the same template + args.conv_template = args.conv_template * len(args.model_path) + # Launch all workers workers = [] - for model_path, model_names in zip(args.model_path, args.model_names): + for conv_template, model_path, model_names in zip( + args.conv_template, args.model_path, args.model_names + ): w = ModelWorker( args.controller_address, args.worker_address, @@ -219,6 +233,7 @@ def create_multi_model_worker(): cpu_offloading=args.cpu_offloading, gptq_config=gptq_config, stream_interval=args.stream_interval, + conv_template=conv_template, ) workers.append(w) for model_name in model_names: From c685951d71f263da268129aec70103fbc9fcf65e Mon Sep 17 00:00:00 2001 From: zhangsibo1129 <134488188+zhangsibo1129@users.noreply.github.com> Date: Mon, 18 Sep 2023 09:38:32 +0800 Subject: [PATCH 28/45] Add Ascend NPU support (#2422) --- README.md | 41 ++++++++++++++++++++++++++++++ fastchat/model/compression.py | 2 ++ fastchat/model/model_adapter.py | 10 +++++++- fastchat/model/model_codet5p.py | 2 ++ fastchat/model/model_falcon.py | 2 ++ fastchat/serve/inference.py | 2 ++ fastchat/serve/launch_all_serve.py | 2 +- fastchat/serve/model_worker.py | 2 ++ 8 files changed, 61 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 75a0f3d80..d1cca9332 100644 --- a/README.md +++ 
b/README.md @@ -157,6 +157,18 @@ python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3 --device xpu ``` Vicuna-7B can run on an Intel Arc A770 16GB. +#### Ascend NPU (Huawei AI Processor) +Install the [Ascend PyTorch Adapter](https://github.com/Ascend/pytorch). Set the CANN environment variables: +``` +source /usr/local/Ascend/ascend-toolkit/set_env.sh +``` + +Use `--device npu` to enable NPU acceleration. +``` +python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3 --device npu +``` +Vicuna-7B/13B can run on an Ascend 910B NPU 60GB. + #### Not Enough Memory If you do not have enough memory, you can enable 8-bit compression by adding `--load-8bit` to commands above. This can reduce memory usage by around half with slightly degraded model quality. @@ -301,6 +313,35 @@ Tips: - If you meet out-of-memory due to "FSDP Warning: When using FSDP, it is efficient and recommended... ", see solutions [here](https://github.com/huggingface/transformers/issues/24724#issuecomment-1645189539). - If you meet out-of-memory during model saving, see solutions [here](https://github.com/pytorch/pytorch/issues/98823). +### Fine-tuning Vicuna-7B with Local NPUs + +You can use the following command to train Vicuna-7B with 8 x 910B (60GB). Use `--nproc_per_node` to specify the number of NPUs. +```bash +torchrun --nproc_per_node=8 --master_port=20001 fastchat/train/train.py \ + --model_name_or_path ~/vicuna-7b-v1.5-16k \ + --data_path data/dummy_conversation.json \ + --fp16 True \ + --output_dir output_vicuna \ + --num_train_epochs 3 \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 1200 \ + --save_total_limit 10 \ + --learning_rate 2e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --fsdp "full_shard auto_wrap" \ + --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --lazy_preprocess True +``` + ### Other models and LoRA support More instructions to train other models (e.g., FastChat-T5) and use LoRA are in [docs/training.md](docs/training.md). diff --git a/fastchat/model/compression.py b/fastchat/model/compression.py index c928db154..e80d9aaba 100644 --- a/fastchat/model/compression.py +++ b/fastchat/model/compression.py @@ -193,6 +193,8 @@ def load_compress_model(model_path, device, torch_dtype, use_fast, revision="mai torch.cuda.empty_cache() if device == "xpu": torch.xpu.empty_cache() + if device == "npu": + torch.npu.empty_cache() for name in model.state_dict(): if name not in linear_weights: diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index 296b53c8f..a90aa61df 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -206,6 +206,13 @@ def load_model( warnings.warn( "Intel Extension for PyTorch is not installed, but is required for xpu inference." 
)
+    elif device == "npu":
+        kwargs = {"torch_dtype": torch.float16}
+        # Try to load torch_npu; while it looks unused, importing it links NPU support into torch
+        try:
+            import torch_npu
+        except ImportError:
+            warnings.warn("Ascend Extension for PyTorch is not installed.")
     else:
         raise ValueError(f"Invalid device: {device}")
@@ -288,6 +295,7 @@ def load_model(
     if (device == "cuda" and num_gpus == 1 and not cpu_offloading) or device in (
         "mps",
         "xpu",
+        "npu",
     ):
         model.to(device)
@@ -369,7 +377,7 @@ def add_model_args(parser):
     parser.add_argument(
         "--device",
         type=str,
-        choices=["cpu", "cuda", "mps", "xpu"],
+        choices=["cpu", "cuda", "mps", "xpu", "npu"],
         default="cuda",
         help="The device type",
     )
diff --git a/fastchat/model/model_codet5p.py b/fastchat/model/model_codet5p.py
index 63481bc5e..0984513c9 100644
--- a/fastchat/model/model_codet5p.py
+++ b/fastchat/model/model_codet5p.py
@@ -104,3 +104,5 @@ def __call__(
         torch.cuda.empty_cache()
     if device == "xpu":
         torch.xpu.empty_cache()
+    if device == "npu":
+        torch.npu.empty_cache()
diff --git a/fastchat/model/model_falcon.py b/fastchat/model/model_falcon.py
index 20afc4f0f..dc8af8efa 100644
--- a/fastchat/model/model_falcon.py
+++ b/fastchat/model/model_falcon.py
@@ -136,3 +136,5 @@ def generate_stream_falcon(
         torch.cuda.empty_cache()
     if device == "xpu":
         torch.xpu.empty_cache()
+    if device == "npu":
+        torch.npu.empty_cache()
diff --git a/fastchat/serve/inference.py b/fastchat/serve/inference.py
index 4e5191610..700bcbfed 100644
--- a/fastchat/serve/inference.py
+++ b/fastchat/serve/inference.py
@@ -263,6 +263,8 @@ def generate_stream(
         torch.cuda.empty_cache()
     if device == "xpu":
         torch.xpu.empty_cache()
+    if device == "npu":
+        torch.npu.empty_cache()

 class ChatIO(abc.ABC):
diff --git a/fastchat/serve/launch_all_serve.py b/fastchat/serve/launch_all_serve.py
index 7847f0064..1952cfb17 100644
--- a/fastchat/serve/launch_all_serve.py
+++ b/fastchat/serve/launch_all_serve.py
@@ -66,7 +66,7 @@
 parser.add_argument(
     "--device",
     type=str,
-    choices=["cpu", "cuda", "mps", "xpu"],
+    choices=["cpu", "cuda", "mps", "xpu", "npu"],
     default="cuda",
     help="The device type",
 )
diff --git a/fastchat/serve/model_worker.py b/fastchat/serve/model_worker.py
index dac3764d4..470bc5eea 100644
--- a/fastchat/serve/model_worker.py
+++ b/fastchat/serve/model_worker.py
@@ -370,6 +370,8 @@ def get_embeddings(self, params):
                 torch.cuda.empty_cache()
             if self.device == "xpu":
                 torch.xpu.empty_cache()
+            if self.device == "npu":
+                torch.npu.empty_cache()
         except torch.cuda.OutOfMemoryError as e:
             ret = {
                 "text": f"{SERVER_ERROR_MSG}\n\n({e})",

From 54a8353b6515893be2eaabc892002f7f55dd6966 Mon Sep 17 00:00:00 2001
From: Tobias Birchler
Date: Mon, 18 Sep 2023 03:48:20 +0200
Subject: [PATCH 29/45] Add raw conversation template (#2417) (#2418)

---
 fastchat/conversation.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fastchat/conversation.py b/fastchat/conversation.py
index fcf882c5c..fe6a849a7 100644
--- a/fastchat/conversation.py
+++ b/fastchat/conversation.py
@@ -285,6 +285,17 @@ def get_conv_template(name: str) -> Conversation:
     return conv_templates[name].copy()

+# An empty template for raw conversation.
+register_conv_template( + Conversation( + name="raw", + system_message="", + roles=("", ""), + sep_style=SeparatorStyle.NO_COLON_SINGLE, + sep="", + ) +) + # A template with a one-shot conversation example register_conv_template( Conversation( From 1119c51705f5d2caace575580f68b39294193c4b Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 17 Sep 2023 18:52:36 -0700 Subject: [PATCH 30/45] Improve docs & UI (#2436) --- README.md | 33 ++-------------------- docs/training.md | 29 +++++++++++++++++++ fastchat/constants.py | 2 +- fastchat/llm_judge/README.md | 2 +- fastchat/serve/gradio_block_arena_anony.py | 8 ++++-- fastchat/serve/gradio_block_arena_named.py | 4 ++- fastchat/serve/gradio_web_server.py | 5 +++- fastchat/serve/monitor/monitor.py | 2 +- fastchat/serve/openai_api_server.py | 12 ++++++-- 9 files changed, 56 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index d1cca9332..7d3c7b20b 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ | [**Demo**](https://chat.lmsys.org/) | [**Discord**](https://discord.gg/HSWAKCrnFx) | [**X**](https://x.com/lmsysorg) | FastChat is an open platform for training, serving, and evaluating large language model based chatbots. The core features include: -- The weights, training code, and evaluation code for state-of-the-art models (e.g., Vicuna). +- The training and evaluation code for state-of-the-art models (e.g., Vicuna). - A distributed multi-model serving system with web UI and OpenAI-compatible RESTful APIs. ## News @@ -313,36 +313,7 @@ Tips: - If you meet out-of-memory due to "FSDP Warning: When using FSDP, it is efficient and recommended... ", see solutions [here](https://github.com/huggingface/transformers/issues/24724#issuecomment-1645189539). - If you meet out-of-memory during model saving, see solutions [here](https://github.com/pytorch/pytorch/issues/98823). -### Fine-tuning Vicuna-7B with Local NPUs - -You can use the following command to train Vicuna-7B with 8 x 910B (60GB). Use `--nproc_per_node` to specify the number of NPUs. -```bash -torchrun --nproc_per_node=8 --master_port=20001 fastchat/train/train.py \ - --model_name_or_path ~/vicuna-7b-v1.5-16k \ - --data_path data/dummy_conversation.json \ - --fp16 True \ - --output_dir output_vicuna \ - --num_train_epochs 3 \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 1 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 1200 \ - --save_total_limit 10 \ - --learning_rate 2e-5 \ - --weight_decay 0. \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --logging_steps 1 \ - --fsdp "full_shard auto_wrap" \ - --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \ - --model_max_length 2048 \ - --gradient_checkpointing True \ - --lazy_preprocess True -``` - -### Other models and LoRA support +### Other models, platforms and LoRA support More instructions to train other models (e.g., FastChat-T5) and use LoRA are in [docs/training.md](docs/training.md). ### Fine-tuning on Any Cloud with SkyPilot diff --git a/docs/training.md b/docs/training.md index 05cbf894d..077221824 100644 --- a/docs/training.md +++ b/docs/training.md @@ -87,3 +87,32 @@ deepspeed fastchat/train/train_lora_t5.py \ --deepspeed playground/deepspeed_config_s2.json ``` + +### Fine-tuning Vicuna-7B with Local NPUs + +You can use the following command to train Vicuna-7B with 8 x 910B (60GB). Use `--nproc_per_node` to specify the number of NPUs. 
+```bash +torchrun --nproc_per_node=8 --master_port=20001 fastchat/train/train.py \ + --model_name_or_path ~/vicuna-7b-v1.5-16k \ + --data_path data/dummy_conversation.json \ + --fp16 True \ + --output_dir output_vicuna \ + --num_train_epochs 3 \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 1200 \ + --save_total_limit 10 \ + --learning_rate 2e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --fsdp "full_shard auto_wrap" \ + --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --lazy_preprocess True +``` diff --git a/fastchat/constants.py b/fastchat/constants.py index 0eb7af371..c26c5f489 100644 --- a/fastchat/constants.py +++ b/fastchat/constants.py @@ -15,7 +15,7 @@ CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION." INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE." # Maximum input length -INPUT_CHAR_LEN_LIMIT = int(os.getenv("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 2560)) +INPUT_CHAR_LEN_LIMIT = int(os.getenv("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 3072)) # Maximum conversation turns CONVERSATION_TURN_LIMIT = 50 # Session expiration time diff --git a/fastchat/llm_judge/README.md b/fastchat/llm_judge/README.md index 9338b1b86..e709db3be 100644 --- a/fastchat/llm_judge/README.md +++ b/fastchat/llm_judge/README.md @@ -1,5 +1,5 @@ # LLM Judge -| [Paper](https://arxiv.org/abs/2306.05685) | [Leaderboard](https://chat.lmsys.org/?leaderboard) | +| [Paper](https://arxiv.org/abs/2306.05685) | [Leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) | In this package, you can use MT-bench questions and prompts to evaluate your models with LLM-as-a-judge. MT-bench is a set of challenging multi-turn open-ended questions for evaluating chat assistants. diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index a598a8c9a..a92cd9790 100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -196,7 +196,7 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re "chatglm-6b": 0.5, } -SAMPLING_BOOST_MODELS = ["wizardlm-70b"] +SAMPLING_BOOST_MODELS = [] model_pairs = [] model_pairs_weights = [] @@ -372,12 +372,13 @@ def bot_response_multi( def build_side_by_side_ui_anony(models): notice_markdown = """ # ⚔️ Chatbot Arena ⚔️ : Benchmarking LLMs in the Wild +| [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://huggingface.co/datasets/lmsys/chatbot_arena_conversations) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | + ### Rules - Chat with two anonymous models side-by-side and vote for which one is better! - You can do multiple turns of conversations before voting. - The names of the models will be revealed after your vote. Conversations with identity keywords (e.g., ChatGPT, Bard, Vicuna) or any votes after the names are revealed will not count towards the leaderboard. - Click "Clear history" to start a new round. 
-- | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | ### Leaderboard See [lmsys/chatbot-arena-leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) or the 4th tab above on this page. @@ -386,7 +387,7 @@ def build_side_by_side_ui_anony(models): By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. **The service collects user dialogue data and reserves the right to distribute it under a Creative Commons Attribution (CC-BY) license.** The demo works better on desktop devices with a wide screen. ### Battle -Please scroll down and start chatting. The models include both closed-source models (e.g., ChatGPT) and open-source models (e.g., Llama, Vicuna). +Please scroll down and start chatting. The models include both closed-source models (e.g., ChatGPT) and open-source models (e.g., Llama). """ states = [gr.State() for _ in range(num_sides)] @@ -423,6 +424,7 @@ def build_side_by_side_ui_anony(models): placeholder="Enter your prompt here and press ENTER", visible=False, container=False, + elem_id="input_box", ) with gr.Column(scale=1, min_width=50): send_btn = gr.Button(value="Battle", visible=False, variant="primary") diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py index c031d28c2..868a5759a 100644 --- a/fastchat/serve/gradio_block_arena_named.py +++ b/fastchat/serve/gradio_block_arena_named.py @@ -299,12 +299,13 @@ def flash_buttons(): def build_side_by_side_ui_named(models): notice_markdown = """ # ⚔️ Chatbot Arena ⚔️ : Benchmarking LLMs in the Wild +| [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://huggingface.co/datasets/lmsys/chatbot_arena_conversations) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | + ### Rules - Chat with two models side-by-side and vote for which one is better! - You pick the models you want to chat with. - You can do multiple turns of conversations before voting. - Click "Clear history" to start a new round. -- | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | ### Terms of use By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. **The service collects user dialogue data and reserves the right to distribute it under a Creative Commons Attribution (CC-BY) license.** The demo works better on desktop devices with a wide screen. 
@@ -355,6 +356,7 @@ def build_side_by_side_ui_named(models): placeholder="Enter your prompt here and press ENTER", visible=False, container=False, + elem_id="input_box", ) with gr.Column(scale=1, min_width=50): send_btn = gr.Button(value="Battle", visible=False, variant="primary") diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index 2fae670dc..b17cee42c 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -500,6 +500,8 @@ def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request) #leaderboard_dataframe td { line-height: 0.1em; } +#input_box textarea { +} footer { display:none !important } @@ -550,9 +552,9 @@ def get_model_description_md(models): def build_single_model_ui(models, add_promotion_links=False): promotion = ( """ +- | [GitHub](https://github.com/lm-sys/FastChat) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | - Introducing Llama 2: The Next Generation Open Source Large Language Model. [[Website]](https://ai.meta.com/llama/) - Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90% ChatGPT Quality. [[Blog]](https://lmsys.org/blog/2023-03-30-vicuna/) -- | [GitHub](https://github.com/lm-sys/FastChat) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | """ if add_promotion_links else "" @@ -594,6 +596,7 @@ def build_single_model_ui(models, add_promotion_links=False): placeholder="Enter your prompt here and press ENTER", visible=False, container=False, + elem_id="input_box", ) with gr.Column(scale=1, min_width=50): send_btn = gr.Button(value="Send", visible=False, variant="primary") diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index b2081bc0d..d576b42b0 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -34,7 +34,7 @@ def make_leaderboard_md(elo_results): - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses. - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks. -💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are computed by [InstructEval](https://github.com/declare-lab/instruct-eval) and [Chain-of-Thought Hub](https://github.com/FranxYao/chain-of-thought-hub). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: Sept, 2023. +💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: Sept, 2023. 
""" return leaderboard_md diff --git a/fastchat/serve/openai_api_server.py b/fastchat/serve/openai_api_server.py index 7b596de64..d692af967 100644 --- a/fastchat/serve/openai_api_server.py +++ b/fastchat/serve/openai_api_server.py @@ -435,7 +435,11 @@ async def chat_completion_stream_generator( return decoded_unicode = content["text"].replace("\ufffd", "") delta_text = decoded_unicode[len(previous_text) :] - previous_text = decoded_unicode if len(decoded_unicode) > len(previous_text) else previous_text + previous_text = ( + decoded_unicode + if len(decoded_unicode) > len(previous_text) + else previous_text + ) if len(delta_text) == 0: delta_text = None @@ -554,7 +558,11 @@ async def generate_completion_stream_generator( return decoded_unicode = content["text"].replace("\ufffd", "") delta_text = decoded_unicode[len(previous_text) :] - previous_text = decoded_unicode if len(decoded_unicode) > len(previous_text) else previous_text + previous_text = ( + decoded_unicode + if len(decoded_unicode) > len(previous_text) + else previous_text + ) # todo: index is not apparent choice_data = CompletionResponseStreamChoice( index=i, From 658736fc45356e574ee62e991603307ffa4c8f55 Mon Sep 17 00:00:00 2001 From: Jae-Won Chung Date: Sun, 17 Sep 2023 21:53:15 -0400 Subject: [PATCH 31/45] Fix Salesforce xgen inference (#2350) --- fastchat/conversation.py | 7 +++---- fastchat/serve/inference.py | 3 ++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fastchat/conversation.py b/fastchat/conversation.py index fe6a849a7..94a9b47f3 100644 --- a/fastchat/conversation.py +++ b/fastchat/conversation.py @@ -765,11 +765,10 @@ def get_conv_template(name: str) -> Conversation: Conversation( name="xgen", system_message="A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n", - roles=("### Human: ", "###"), - sep_style=SeparatorStyle.NO_COLON_SINGLE, + roles=("### Human", "### Assistant"), + sep_style=SeparatorStyle.ADD_COLON_SINGLE, sep="\n", - stop_token_ids=[50256, 0, 1, 2], - stop_str="<|endoftext|>", + stop_token_ids=[50256], ) ) diff --git a/fastchat/serve/inference.py b/fastchat/serve/inference.py index 700bcbfed..dac10d542 100644 --- a/fastchat/serve/inference.py +++ b/fastchat/serve/inference.py @@ -80,7 +80,8 @@ def generate_stream( echo = bool(params.get("echo", True)) stop_str = params.get("stop", None) stop_token_ids = params.get("stop_token_ids", None) or [] - stop_token_ids.append(tokenizer.eos_token_id) + if tokenizer.eos_token_id not in stop_token_ids: + stop_token_ids.append(tokenizer.eos_token_id) logits_processor = prepare_logits_processor( temperature, repetition_penalty, top_p, top_k From d26d9e711755826aebe34a489286d97ab0b714e9 Mon Sep 17 00:00:00 2001 From: Tobias Birchler Date: Mon, 18 Sep 2023 03:58:03 +0200 Subject: [PATCH 32/45] Add support for Phind-CodeLlama models (#2415) (#2416) Co-authored-by: Lianmin Zheng --- fastchat/conversation.py | 13 +++++++++++++ fastchat/model/model_adapter.py | 11 +++++++++++ 2 files changed, 24 insertions(+) diff --git a/fastchat/conversation.py b/fastchat/conversation.py index 94a9b47f3..9a485b815 100644 --- a/fastchat/conversation.py +++ b/fastchat/conversation.py @@ -940,6 +940,19 @@ def get_conv_template(name: str) -> Conversation: ) ) +# Phind template +register_conv_template( + Conversation( + name="phind", + system_message="### System Prompt\nYou are an intelligent programming assistant.", + roles=("### User Message", "### Assistant"), + messages=(), + offset=0, + sep_style=SeparatorStyle.ADD_COLON_SINGLE, + sep="\n\n", + ) +) + if __name__ == "__main__": print("Vicuna template:") diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index a90aa61df..028ac91f1 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -1601,6 +1601,16 @@ def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("llama-2") +class PhindCodeLlamaAdapter(CodeLlamaAdapter): + """The model adapter for Phind Code Llama""" + + def match(self, model_path: str): + return "phind-codellama-" in model_path.lower() + + def get_default_conv_template(self, model_path: str) -> Conversation: + return get_conv_template("phind") + + # Note: the registration order matters. # The one registered earlier has a higher matching priority. register_model_adapter(PeftModelAdapter) @@ -1658,6 +1668,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(VigogneChatAdapter) register_model_adapter(OpenLLaMaOpenInstructAdapter) register_model_adapter(ReaLMAdapter) +register_model_adapter(PhindCodeLlamaAdapter) register_model_adapter(CodeLlamaAdapter) # After all adapters, try the default base adapter. 
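For reference, the "phind" template registered in PATCH 32 produces prompts of the shape sketched below; this is an illustration derived from the template definition above, and the user message is invented:

```python
# Render the "phind" prompt format from PATCH 32 (the message is an example).
from fastchat.conversation import get_conv_template

conv = get_conv_template("phind")
conv.append_message(conv.roles[0], "Write a hello-world program in Rust.")
conv.append_message(conv.roles[1], None)
print(conv.get_prompt())
# With ADD_COLON_SINGLE and sep="\n\n", the output should look roughly like:
#   ### System Prompt
#   You are an intelligent programming assistant.
#
#   ### User Message: Write a hello-world program in Rust.
#
#   ### Assistant:
```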
From 0a5f50377a43a9fce97b5f535809e0e8a8b5a577 Mon Sep 17 00:00:00 2001 From: Mingdao Liu Date: Mon, 18 Sep 2023 10:01:58 +0800 Subject: [PATCH 33/45] Add falcon 180B chat conversation template (#2384) --- fastchat/conversation.py | 25 +++++++++++++++++++++++++ fastchat/model/model_adapter.py | 11 ++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/fastchat/conversation.py b/fastchat/conversation.py index 9a485b815..76e4f151d 100644 --- a/fastchat/conversation.py +++ b/fastchat/conversation.py @@ -27,6 +27,7 @@ class SeparatorStyle(IntEnum): RWKV = auto() PHOENIX = auto() ROBIN = auto() + FALCON_CHAT = auto() @dataclasses.dataclass @@ -200,6 +201,17 @@ def get_prompt(self) -> str: else: ret += role + ":\n" return ret + elif self.sep_style == SeparatorStyle.FALCON_CHAT: + ret = "" + if self.system_message: + ret += "System: " + self.system_message + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ": " + + return ret else: raise ValueError(f"Invalid style: {self.sep_style}") @@ -940,6 +952,19 @@ def get_conv_template(name: str) -> Conversation: ) ) +# Falcon 180B chat template +register_conv_template( + Conversation( + name="falcon-chat", + roles=("User", "Falcon"), + messages=[], + sep_style=SeparatorStyle.FALCON_CHAT, + sep="\n", + sep2="<|endoftext|>", + stop_str="\nUser:", # use stop_str to stop generation after stop_token_ids, it will also remove stop_str from the generated text + ) +) + # Phind template register_conv_template( Conversation( diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index 028ac91f1..e6b7bd57e 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -1112,7 +1112,7 @@ class FalconAdapter(BaseModelAdapter): """The model adapter for tiiuae/falcon-40b""" def match(self, model_path: str): - return "falcon" in model_path.lower() + return "falcon" in model_path.lower() and "chat" not in model_path.lower() def load_model(self, model_path: str, from_pretrained_kwargs: dict): revision = from_pretrained_kwargs.get("revision", "main") @@ -1133,6 +1133,14 @@ def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("falcon") +class FalconChatAdapter(BaseModelAdapter): + def match(self, model_path: str): + return "falcon" in model_path.lower() and "chat" in model_path.lower() + + def get_default_conv_template(self, model_path: str) -> Conversation: + return get_conv_template("falcon-chat") + + class TigerBotAdapter(BaseModelAdapter): """The model adapter for TigerResearch/tigerbot-7b-sft""" @@ -1647,6 +1655,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(CamelAdapter) register_model_adapter(ChangGPTAdapter) register_model_adapter(TuluAdapter) +register_model_adapter(FalconChatAdapter) register_model_adapter(FalconAdapter) register_model_adapter(TigerBotAdapter) register_model_adapter(BaichuanAdapter) From 318d07071081176e8f21472cf337e0503117e820 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 17 Sep 2023 19:33:55 -0700 Subject: [PATCH 34/45] Improve docs (#2438) --- docs/model_support.md | 4 +++- fastchat/conversation.py | 2 ++ fastchat/model/model_adapter.py | 24 ++++++++++++------------ fastchat/model/model_registry.py | 11 +++++++++-- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/docs/model_support.md b/docs/model_support.md index 8c1a58eea..a9eb4c895 100644 --- a/docs/model_support.md +++ 
b/docs/model_support.md @@ -31,6 +31,7 @@ - [openaccess-ai-collective/manticore-13b-chat-pyg](https://huggingface.co/openaccess-ai-collective/manticore-13b-chat-pyg) - [OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5](https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5) - [VMware/open-llama-7b-v2-open-instruct](https://huggingface.co/VMware/open-llama-7b-v2-open-instruct) +- [Phind/Phind-CodeLlama-34B-v2](https://huggingface.co/Phind/Phind-CodeLlama-34B-v2) - [project-baize/baize-v2-7b](https://huggingface.co/project-baize/baize-v2-7b) - [Qwen/Qwen-7B-Chat](https://huggingface.co/Qwen/Qwen-7B-Chat) - [Salesforce/codet5p-6b](https://huggingface.co/Salesforce/codet5p-6b) @@ -38,6 +39,7 @@ - [THUDM/chatglm-6b](https://huggingface.co/THUDM/chatglm-6b) - [THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b) - [tiiuae/falcon-40b](https://huggingface.co/tiiuae/falcon-40b) +- [tiiuae/falcon-180B-chat](https://huggingface.co/tiiuae/falcon-180B-chat) - [timdettmers/guanaco-33b-merged](https://huggingface.co/timdettmers/guanaco-33b-merged) - [togethercomputer/RedPajama-INCITE-7B-Chat](https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Chat) - [WizardLM/WizardLM-13B-V1.0](https://huggingface.co/WizardLM/WizardLM-13B-V1.0) @@ -71,7 +73,7 @@ You can add `--debug` to see the actual prompt sent to the model. FastChat uses the `Conversation` class to handle prompt templates and `BaseModelAdapter` class to handle model loading. -1. Implement a conversation template for the new model at [fastchat/conversation.py](https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py). You can follow existing examples and use `register_conv_template` to add a new one. +1. Implement a conversation template for the new model at [fastchat/conversation.py](https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py). You can follow existing examples and use `register_conv_template` to add a new one. Please also add a link to the official reference code if possible. 2. Implement a model adapter for the new model at [fastchat/model/model_adapter.py](https://github.com/lm-sys/FastChat/blob/main/fastchat/model/model_adapter.py). You can follow existing examples and use `register_model_adapter` to add a new one. 3. (Optional) add the model name to the "Supported models" [section](#supported-models) above and add more information in [fastchat/model/model_registry.py](https://github.com/lm-sys/FastChat/blob/main/fastchat/model/model_registry.py). 
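The three steps above reduce to two registration calls. A hypothetical sketch, in which "mymodel", `MyModelAdapter`, and the template contents are invented for illustration and do not come from these patches:

```python
# Hypothetical example of steps 1 and 2; "mymodel" is an invented name.
from fastchat.conversation import (
    Conversation,
    SeparatorStyle,
    get_conv_template,
    register_conv_template,
)
from fastchat.model.model_adapter import BaseModelAdapter, register_model_adapter

# Step 1: register a conversation template.
register_conv_template(
    Conversation(
        name="mymodel",
        system_message="A chat between a user and an assistant.",
        roles=("USER", "ASSISTANT"),
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n",
    )
)

# Step 2: register a model adapter that routes matching paths to the template.
class MyModelAdapter(BaseModelAdapter):
    """The model adapter for mymodel (e.g., my-org/mymodel-7b)"""

    def match(self, model_path: str):
        return "mymodel" in model_path.lower()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        return get_conv_template("mymodel")

register_model_adapter(MyModelAdapter)
```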
diff --git a/fastchat/conversation.py b/fastchat/conversation.py index 76e4f151d..763856f85 100644 --- a/fastchat/conversation.py +++ b/fastchat/conversation.py @@ -953,6 +953,7 @@ def get_conv_template(name: str) -> Conversation: ) # Falcon 180B chat template +# source: https://huggingface.co/spaces/tiiuae/falcon-180b-demo/blob/d1590ee7fae9b6ce331ba7808e61a29dcce9239f/app.py#L28-L37 register_conv_template( Conversation( name="falcon-chat", @@ -966,6 +967,7 @@ def get_conv_template(name: str) -> Conversation: ) # Phind template +# source: https://huggingface.co/Phind/Phind-CodeLlama-34B-v2 register_conv_template( Conversation( name="phind", diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index e6b7bd57e..a3b2632db 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -665,7 +665,7 @@ def match(self, model_path: str): class KoalaAdapter(BaseModelAdapter): - """The model adapter for koala""" + """The model adapter for Koala""" use_fast_tokenizer = False @@ -677,7 +677,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class AlpacaAdapter(BaseModelAdapter): - """The model adapter for alpaca""" + """The model adapter for Alpaca""" use_fast_tokenizer = False @@ -1267,7 +1267,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class Llama2Adapter(BaseModelAdapter): - """The model adapter for llama-2""" + """The model adapter for Llama-2 (e.g., meta-llama/Llama-2-7b-hf)""" def match(self, model_path: str): return "llama-2" in model_path.lower() @@ -1283,7 +1283,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class CuteGPTAdapter(BaseModelAdapter): - """The model adapter for llama-2""" + """The model adapter for CuteGPT""" def match(self, model_path: str): return "cutegpt" in model_path.lower() @@ -1327,7 +1327,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class WizardCoderAdapter(BaseModelAdapter): - """The model adapter for WizardCoder""" + """The model adapter for WizardCoder (e.g., WizardLM/WizardCoder-Python-34B-V1.0)""" use_fast_tokenizer = False @@ -1401,7 +1401,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class BGEAdapter(BaseModelAdapter): - """The model adapter for BGE""" + """The model adapter for BGE (e.g., BAAI/bge-large-en-v1.5)""" use_fast_tokenizer = False @@ -1430,7 +1430,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class E5Adapter(BaseModelAdapter): - """The model adapter for E5""" + """The model adapter for E5 (e.g., intfloat/e5-large-v2)""" use_fast_tokenizer = False @@ -1508,7 +1508,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class VigogneInstructAdapter(BaseModelAdapter): - """The model adapter for Vigogne-Instruct""" + """The model adapter for Vigogne-Instruct (e.g., bofenghuang/vigogne-2-7b-instruct)""" use_fast_tokenizer = False @@ -1536,7 +1536,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class VigogneChatAdapter(BaseModelAdapter): - """The model adapter for Vigogne-Chat""" + """The model adapter for Vigogne-Chat (e.g., bofenghuang/vigogne-7b-chat)""" use_fast_tokenizer = False @@ -1564,7 +1564,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class OpenLLaMaOpenInstructAdapter(BaseModelAdapter): - """The model adapter for OpenLLaMa-Open-Instruct""" + """The model adapter for OpenLLaMa-Open-Instruct (e.g., VMware/open-llama-7b-open-instruct)""" use_fast_tokenizer = 
False @@ -1594,7 +1594,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class CodeLlamaAdapter(BaseModelAdapter): - """The model adapter for Code Llama""" + """The model adapter for CodeLlama (e.g., codellama/CodeLlama-34b-hf)""" def match(self, model_path: str): return "codellama" in model_path.lower() @@ -1610,7 +1610,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class PhindCodeLlamaAdapter(CodeLlamaAdapter): - """The model adapter for Phind Code Llama""" + """The model adapter for Phind-CodeLlama (e.g., Phind/Phind-CodeLlama-34B-v2)""" def match(self, model_path: str): return "phind-codellama-" in model_path.lower() diff --git a/fastchat/model/model_registry.py b/fastchat/model/model_registry.py index aaf7e5e5f..0612ca832 100644 --- a/fastchat/model/model_registry.py +++ b/fastchat/model/model_registry.py @@ -252,9 +252,16 @@ def get_model_info(name: str) -> ModelInfo: "A chatbot fine-tuned from RedPajama-INCITE-7B-Base by Together", ) register_model_info( - ["falcon-7b", "falcon-7b-instruct", "falcon-40b", "falcon-40b-instruct"], + [ + "falcon-7b", + "falcon-7b-instruct", + "falcon-40b", + "falcon-40b-instruct", + "falcon-180b", + "falcon-180b-chat", + ], "Falcon", - "https://huggingface.co/tiiuae/falcon-40b", + "https://huggingface.co/tiiuae/falcon-180B", "TII's flagship series of large language models", ) register_model_info( From 9cf3c8bdc8f4c18fa944f9dc13a0dd3783452e0b Mon Sep 17 00:00:00 2001 From: Ying Sheng Date: Sun, 17 Sep 2023 20:06:14 -0700 Subject: [PATCH 35/45] add dtype and seed (#2430) --- fastchat/llm_judge/README.md | 4 +-- fastchat/llm_judge/gen_model_answer.py | 42 ++++++++++++++++++-------- fastchat/model/model_adapter.py | 11 +++++++ fastchat/serve/cli.py | 3 ++ fastchat/serve/inference.py | 2 ++ fastchat/serve/model_worker.py | 24 +++++++++++++-- fastchat/utils.py | 15 +++++++++ 7 files changed, 84 insertions(+), 17 deletions(-) diff --git a/fastchat/llm_judge/README.md b/fastchat/llm_judge/README.md index e709db3be..eb1e3b2e2 100644 --- a/fastchat/llm_judge/README.md +++ b/fastchat/llm_judge/README.md @@ -10,7 +10,7 @@ To automate the evaluation process, we prompt strong LLMs like GPT-4 to act as j - [Review Pre-Generated Model Answers and Judgments](#review-pre-generated-model-answers-and-judgments) - [MT-Bench](#mt-bench) - [Agreement Computation](#agreement-computation) -- [Dataset](#dataset) +- [Datasets](#datasets) - [Citation](#citation) ## Install @@ -133,7 +133,7 @@ We released 3.3K human annotations for model responses generated by 6 models in This Colab [notebook](https://colab.research.google.com/drive/1ctgygDRJhVGUJTQy8-bRZCl1WNcT8De6?usp=sharing) shows how to compute the agreement between humans and GPT-4 judge with the dataset. Our results show that humans and GPT-4 judge achieve over 80\% agreement, the same level of agreement between humans. 
-## Dataset +## Datasets - [Chatbot Arena Conversation Dataset](https://huggingface.co/datasets/lmsys/chatbot_arena_conversations) - [MT-bench Human Annotation Dataset](https://huggingface.co/datasets/lmsys/mt_bench_human_judgments) diff --git a/fastchat/llm_judge/gen_model_answer.py b/fastchat/llm_judge/gen_model_answer.py index 3d093ecd5..c36665b8f 100644 --- a/fastchat/llm_judge/gen_model_answer.py +++ b/fastchat/llm_judge/gen_model_answer.py @@ -15,6 +15,7 @@ from fastchat.llm_judge.common import load_questions, temperature_config from fastchat.model import load_model, get_conversation_template +from fastchat.utils import str_to_torch_dtype def run_eval( @@ -29,6 +30,7 @@ def run_eval( num_gpus_per_model, num_gpus_total, max_gpu_memory, + dtype, ): questions = load_questions(question_file, question_begin, question_end) # random shuffle the questions to balance the loading @@ -45,7 +47,7 @@ def run_eval( else: get_answers_func = get_model_answers - chunk_size = len(questions) // (num_gpus_total // num_gpus_per_model) // 2 + chunk_size = len(questions) // (num_gpus_total // num_gpus_per_model) ans_handles = [] for i in range(0, len(questions), chunk_size): ans_handles.append( @@ -58,6 +60,7 @@ def run_eval( num_choices, num_gpus_per_model, max_gpu_memory, + dtype=dtype, ) ) @@ -75,12 +78,14 @@ def get_model_answers( num_choices, num_gpus_per_model, max_gpu_memory, + dtype, ): model, tokenizer = load_model( model_path, device="cuda", num_gpus=num_gpus_per_model, max_gpu_memory=max_gpu_memory, + dtype=dtype, load_8bit=False, cpu_offloading=False, debug=False, @@ -192,7 +197,9 @@ def reorg_answer_file(answer_file): required=True, help="The path to the weights. This can be a local folder or a Hugging Face repo ID.", ) - parser.add_argument("--model-id", type=str, required=True) + parser.add_argument( + "--model-id", type=str, required=True, help="A custom name for the model." + ) parser.add_argument( "--bench-name", type=str, @@ -234,6 +241,14 @@ def reorg_answer_file(answer_file): type=str, help="Maxmum GPU memory used for model weights per GPU.", ) + parser.add_argument( + "--dtype", + type=str, + choices=["float32", "float16", "bfloat16"], + help="Override the default dtype. 
If not set, it will use float16 on GPU and float32 on CPU.", + default=None, + ) + args = parser.parse_args() if args.num_gpus_total // args.num_gpus_per_model > 1: @@ -250,17 +265,18 @@ def reorg_answer_file(answer_file): print(f"Output to {answer_file}") run_eval( - args.model_path, - args.model_id, - question_file, - args.question_begin, - args.question_end, - answer_file, - args.max_new_token, - args.num_choices, - args.num_gpus_per_model, - args.num_gpus_total, - args.max_gpu_memory, + model_path=args.model_path, + model_id=args.model_id, + question_file=question_file, + question_begin=args.question_begin, + question_end=args.question_end, + answer_file=answer_file, + max_new_token=args.max_new_token, + num_choices=args.num_choices, + num_gpus_per_model=args.num_gpus_per_model, + num_gpus_total=args.num_gpus_total, + max_gpu_memory=args.max_gpu_memory, + dtype=str_to_torch_dtype(args.dtype), ) reorg_answer_file(answer_file) diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index a3b2632db..72ef6f918 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -152,6 +152,7 @@ def load_model( device: str = "cuda", num_gpus: int = 1, max_gpu_memory: Optional[str] = None, + dtype: Optional[torch.dtype] = None, load_8bit: bool = False, cpu_offloading: bool = False, gptq_config: Optional[GptqConfig] = None, @@ -282,6 +283,9 @@ def load_model( return model, tokenizer kwargs["revision"] = revision + if dtype is not None: # Overwrite dtype if it is provided in the arguments. + kwargs["torch_dtype"] = dtype + # Load model model, tokenizer = adapter.load_model(model_path, kwargs) @@ -393,6 +397,13 @@ def add_model_args(parser): type=str, help="The maximum memory per GPU for storing model weights. Use a string like '13Gib'", ) + parser.add_argument( + "--dtype", + type=str, + choices=["float32", "float16", "bfloat16"], + help="Override the default dtype. 
If not set, it will use float16 on GPU and float32 on CPU.", + default=None, + ) parser.add_argument( "--load-8bit", action="store_true", help="Use 8-bit quantization" ) diff --git a/fastchat/serve/cli.py b/fastchat/serve/cli.py index 41161ae35..de52a44bd 100644 --- a/fastchat/serve/cli.py +++ b/fastchat/serve/cli.py @@ -26,11 +26,13 @@ from rich.console import Console from rich.live import Live from rich.markdown import Markdown +import torch from fastchat.model.model_adapter import add_model_args from fastchat.modules.gptq import GptqConfig from fastchat.modules.awq import AWQConfig from fastchat.serve.inference import ChatIO, chat_loop +from fastchat.utils import str_to_torch_dtype class SimpleChatIO(ChatIO): @@ -208,6 +210,7 @@ def main(args): args.device, args.num_gpus, args.max_gpu_memory, + str_to_torch_dtype(args.dtype), args.load_8bit, args.cpu_offloading, args.conv_template, diff --git a/fastchat/serve/inference.py b/fastchat/serve/inference.py index dac10d542..169f086b9 100644 --- a/fastchat/serve/inference.py +++ b/fastchat/serve/inference.py @@ -291,6 +291,7 @@ def chat_loop( device: str, num_gpus: int, max_gpu_memory: str, + dtype: Optional[torch.dtype], load_8bit: bool, cpu_offloading: bool, conv_template: Optional[str], @@ -312,6 +313,7 @@ def chat_loop( device=device, num_gpus=num_gpus, max_gpu_memory=max_gpu_memory, + dtype=dtype, load_8bit=load_8bit, cpu_offloading=cpu_offloading, gptq_config=gptq_config, diff --git a/fastchat/serve/model_worker.py b/fastchat/serve/model_worker.py index 470bc5eea..54d51cfd0 100644 --- a/fastchat/serve/model_worker.py +++ b/fastchat/serve/model_worker.py @@ -34,6 +34,7 @@ ) import torch import torch.nn.functional as F +from transformers import set_seed import uvicorn from fastchat.constants import WORKER_HEART_BEAT_INTERVAL, ErrorCode, SERVER_ERROR_MSG @@ -46,7 +47,12 @@ ) from fastchat.modules.gptq import GptqConfig from fastchat.modules.awq import AWQConfig -from fastchat.utils import build_logger, pretty_print_semaphore, get_context_length +from fastchat.utils import ( + build_logger, + pretty_print_semaphore, + get_context_length, + str_to_torch_dtype, +) worker_id = str(uuid.uuid4())[:8] @@ -190,13 +196,15 @@ def __init__( device: str, num_gpus: int, max_gpu_memory: str, + dtype: Optional[torch.dtype] = None, load_8bit: bool = False, cpu_offloading: bool = False, gptq_config: Optional[GptqConfig] = None, awq_config: Optional[AWQConfig] = None, stream_interval: int = 2, - conv_template: str = None, + conv_template: Optional[str] = None, embed_in_truncate: bool = False, + seed: Optional[int] = None, **kwargs, ): super().__init__( @@ -215,6 +223,7 @@ def __init__( device=device, num_gpus=num_gpus, max_gpu_memory=max_gpu_memory, + dtype=dtype, load_8bit=load_8bit, cpu_offloading=cpu_offloading, gptq_config=gptq_config, @@ -227,6 +236,7 @@ def __init__( self.generate_stream_func = get_generate_stream_function(self.model, model_path) self.stream_interval = stream_interval self.embed_in_truncate = embed_in_truncate + self.seed = seed if not no_register: self.init_heart_beat() @@ -235,6 +245,8 @@ def generate_stream_gate(self, params): self.call_ct += 1 try: + if self.seed is not None: + set_seed(self.seed) for output in self.generate_stream_func( self.model, self.tokenizer, @@ -475,6 +487,12 @@ def create_model_worker(): ) parser.add_argument("--stream-interval", type=int, default=2) parser.add_argument("--no-register", action="store_true") + parser.add_argument( + "--seed", + type=int, + default=None, + help="Overwrite the random seed for 
each generation.", + ) args = parser.parse_args() logger.info(f"args: {args}") @@ -508,6 +526,7 @@ def create_model_worker(): device=args.device, num_gpus=args.num_gpus, max_gpu_memory=args.max_gpu_memory, + dtype=str_to_torch_dtype(args.dtype), load_8bit=args.load_8bit, cpu_offloading=args.cpu_offloading, gptq_config=gptq_config, @@ -515,6 +534,7 @@ def create_model_worker(): stream_interval=args.stream_interval, conv_template=args.conv_template, embed_in_truncate=args.embed_in_truncate, + seed=args.seed, ) return args, worker diff --git a/fastchat/utils.py b/fastchat/utils.py index 25370eb17..e2d3a6ac6 100644 --- a/fastchat/utils.py +++ b/fastchat/utils.py @@ -302,3 +302,18 @@ def get_context_length(config): if val is not None: return int(rope_scaling_factor * val) return 2048 + + +def str_to_torch_dtype(dtype: str): + import torch + + if dtype is None: + return None + elif dtype == "float32": + return torch.float32 + elif dtype == "float16": + return torch.float16 + elif dtype == "bfloat16": + return torch.bfloat16 + else: + raise ValueError(f"Unrecognized dtype: {dtype}") From 24acac171cdfc9413b2adf9a7d9219afa42a6b8d Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 17 Sep 2023 20:24:06 -0700 Subject: [PATCH 36/45] Data cleaning scripts for dataset release (#2440) --- docs/commands/leaderboard.md | 13 +- fastchat/data/merge.py | 1 - fastchat/serve/monitor/clean_battle_data.py | 19 +- fastchat/serve/monitor/clean_chat_data.py | 6 +- .../arena_33k}/count_unique_users.py | 0 .../arena_33k}/filter_bad_conv.py | 0 .../arena_33k}/merge_field.py | 0 .../arena_33k}/sample.py | 0 .../arena_33k}/upload_hf_dataset.py | 0 .../lmsys_chat_1m/compute_stats.py | 119 ++++++++ .../lmsys_chat_1m/filter_bad_conv.py | 148 ++++++++++ .../lmsys_chat_1m/final_post_processing.py | 27 ++ .../lmsys_chat_1m/instructions.md | 23 ++ .../lmsys_chat_1m/merge_oai_tag.py | 45 +++ .../lmsys_chat_1m/process_all.sh | 18 ++ .../lmsys_chat_1m/upload_hf_dataset.py | 17 ++ .../serve/monitor/hf_space_leaderboard_app.py | 258 ------------------ fastchat/serve/monitor/intersect_conv_file.py | 25 ++ fastchat/serve/monitor/monitor.py | 14 +- fastchat/serve/monitor/replace_model_name.py | 21 -- fastchat/serve/monitor/summarize_cluster.py | 1 - 21 files changed, 457 insertions(+), 298 deletions(-) rename fastchat/serve/monitor/{conv_release_scripts => dataset_release_scripts/arena_33k}/count_unique_users.py (100%) rename fastchat/serve/monitor/{conv_release_scripts => dataset_release_scripts/arena_33k}/filter_bad_conv.py (100%) rename fastchat/serve/monitor/{conv_release_scripts => dataset_release_scripts/arena_33k}/merge_field.py (100%) rename fastchat/serve/monitor/{conv_release_scripts => dataset_release_scripts/arena_33k}/sample.py (100%) rename fastchat/serve/monitor/{conv_release_scripts => dataset_release_scripts/arena_33k}/upload_hf_dataset.py (100%) create mode 100644 fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/compute_stats.py create mode 100644 fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/filter_bad_conv.py create mode 100644 fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/final_post_processing.py create mode 100644 fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/instructions.md create mode 100644 fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/merge_oai_tag.py create mode 100644 fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/process_all.sh create mode 100644 
fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/upload_hf_dataset.py delete mode 100644 fastchat/serve/monitor/hf_space_leaderboard_app.py create mode 100644 fastchat/serve/monitor/intersect_conv_file.py delete mode 100644 fastchat/serve/monitor/replace_model_name.py diff --git a/docs/commands/leaderboard.md b/docs/commands/leaderboard.md index d06aa1a05..0a668f649 100644 --- a/docs/commands/leaderboard.md +++ b/docs/commands/leaderboard.md @@ -11,5 +11,16 @@ python3 clean_battle_data.py ### Run Elo analysis ``` -python3 elo_analysis.py --clean-battle-file clean_battle_20230523.json +python3 elo_analysis.py --clean-battle-file clean_battle_20230905.json +``` + +### Copy files to HF space +1. update plots +``` +scp atlas:/data/lmzheng/FastChat/fastchat/serve/monitor/elo_results_20230905.pkl . +``` + +2. update table +``` +wget https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard/raw/main/leaderboard_table_20230905.csv ``` diff --git a/fastchat/data/merge.py b/fastchat/data/merge.py index 044401315..0ae63ea76 100644 --- a/fastchat/data/merge.py +++ b/fastchat/data/merge.py @@ -6,7 +6,6 @@ import argparse import json -from typing import Dict, Sequence, Optional if __name__ == "__main__": diff --git a/fastchat/serve/monitor/clean_battle_data.py b/fastchat/serve/monitor/clean_battle_data.py index 63c8e565b..4cab1af42 100644 --- a/fastchat/serve/monitor/clean_battle_data.py +++ b/fastchat/serve/monitor/clean_battle_data.py @@ -34,6 +34,7 @@ "palm", "lamda", "google", + "llama", "NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.", ] @@ -43,11 +44,7 @@ def get_log_files(max_num_files=None): dates = [] - for month in [4, 5, 6, 7]: - for day in range(1, 32): - dates.append(f"2023-{month:02d}-{day:02d}") - - for month in [8]: + for month in [4, 5, 6, 7, 8, 9]: for day in range(1, 32): dates.append(f"2023-{month:02d}-{day:02d}") @@ -85,7 +82,7 @@ def replace_model_name(old_name): ) -def clean_battle_data(log_files): +def clean_battle_data(log_files, exclude_model_names): data = [] for filename in tqdm(log_files, desc="read files"): for retry in range(5): @@ -173,6 +170,11 @@ def clean_battle_data(log_files): # Replace bard with palm models = [replace_model_name(m) for m in models] + # Exclude certain models + if any(x in exclude_model_names for x in models): + ct_invalid += 1 + continue + question_id = row["states"][0]["conv_id"] conversation_a = to_openai_format( row["states"][0]["messages"][row["states"][0]["offset"] :] @@ -186,7 +188,7 @@ def clean_battle_data(log_files): all_ips[ip] = len(all_ips) user_id = all_ips[ip] - # Save the result + # Save the results battles.append( dict( question_id=question_id, @@ -228,10 +230,11 @@ def clean_battle_data(log_files): parser.add_argument( "--mode", type=str, choices=["simple", "conv_release"], default="simple" ) + parser.add_argument("--exclude-model-names", type=str, nargs="+") args = parser.parse_args() log_files = get_log_files(args.max_num_files) - battles = clean_battle_data(log_files) + battles = clean_battle_data(log_files, args.exclude_model_names or []) last_updated_tstamp = battles[-1]["tstamp"] cutoff_date = datetime.datetime.fromtimestamp( last_updated_tstamp, tz=timezone("US/Pacific") diff --git a/fastchat/serve/monitor/clean_chat_data.py b/fastchat/serve/monitor/clean_chat_data.py index 54e7b3e39..86d15bac2 100644 --- a/fastchat/serve/monitor/clean_chat_data.py +++ b/fastchat/serve/monitor/clean_chat_data.py @@ -28,11 +28,7 @@ def get_log_files(max_num_files=None): dates = [] - for month in [4, 
5, 6, 7]: - for day in range(1, 32): - dates.append(f"2023-{month:02d}-{day:02d}") - - for month in [8]: + for month in [4, 5, 6, 7, 8, 9, 10]: for day in range(1, 32): dates.append(f"2023-{month:02d}-{day:02d}") diff --git a/fastchat/serve/monitor/conv_release_scripts/count_unique_users.py b/fastchat/serve/monitor/dataset_release_scripts/arena_33k/count_unique_users.py similarity index 100% rename from fastchat/serve/monitor/conv_release_scripts/count_unique_users.py rename to fastchat/serve/monitor/dataset_release_scripts/arena_33k/count_unique_users.py diff --git a/fastchat/serve/monitor/conv_release_scripts/filter_bad_conv.py b/fastchat/serve/monitor/dataset_release_scripts/arena_33k/filter_bad_conv.py similarity index 100% rename from fastchat/serve/monitor/conv_release_scripts/filter_bad_conv.py rename to fastchat/serve/monitor/dataset_release_scripts/arena_33k/filter_bad_conv.py diff --git a/fastchat/serve/monitor/conv_release_scripts/merge_field.py b/fastchat/serve/monitor/dataset_release_scripts/arena_33k/merge_field.py similarity index 100% rename from fastchat/serve/monitor/conv_release_scripts/merge_field.py rename to fastchat/serve/monitor/dataset_release_scripts/arena_33k/merge_field.py diff --git a/fastchat/serve/monitor/conv_release_scripts/sample.py b/fastchat/serve/monitor/dataset_release_scripts/arena_33k/sample.py similarity index 100% rename from fastchat/serve/monitor/conv_release_scripts/sample.py rename to fastchat/serve/monitor/dataset_release_scripts/arena_33k/sample.py diff --git a/fastchat/serve/monitor/conv_release_scripts/upload_hf_dataset.py b/fastchat/serve/monitor/dataset_release_scripts/arena_33k/upload_hf_dataset.py similarity index 100% rename from fastchat/serve/monitor/conv_release_scripts/upload_hf_dataset.py rename to fastchat/serve/monitor/dataset_release_scripts/arena_33k/upload_hf_dataset.py diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/compute_stats.py b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/compute_stats.py new file mode 100644 index 000000000..97abaaa0d --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/compute_stats.py @@ -0,0 +1,119 @@ +""" +From colab: +https://colab.research.google.com/drive/1oMdw_Lqgmd6DletSOLHsyD-Rc96cRShs?usp=sharing +""" +import argparse +import datetime +import json +import os +from pytz import timezone +import time + +import kaleido +import numpy as np +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +from tqdm import tqdm + +import plotly.io as pio + +pio.kaleido.scope.mathjax = None + +parser = argparse.ArgumentParser() +parser.add_argument("--in-file", type=str, required=True) +parser.add_argument("--scale", type=int, required=True) +args = parser.parse_args() + +filename = args.in_file +scale = args.scale +convs = json.load(open(filename)) +df = pd.DataFrame(convs) +df + +print(f"#ips: {df['user_id'].nunique() * scale}") +print(f"#models: {df['model'].nunique()}") +print(f"#language: {df['language'].nunique()}") +print(f"#turns: {df['turn'].mean()}") + +model_counts = df["model"].value_counts() * scale +# print("model counts", model_counts) +fig = px.bar(x=model_counts.index, y=model_counts) +fig.update_layout( + xaxis_title=None, + yaxis_title="Count", + height=200, + width=950, + margin=dict(l=0, r=0, t=0, b=0), +) +fig.show() +fig.write_image("model_count.pdf") + + +model_counts = df["language"].value_counts().head(25) * scale +fig = px.bar(x=model_counts.index, y=model_counts) 
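+# Editor's note (assumption, not part of the original patch): `--scale`
+# compensates for sampling. If the input file is a uniform 1/scale sample of
+# the full logs, multiplying each count by `scale` estimates the true totals;
+# instructions.md, for example, plots the 100k sample with --scale 10.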
+fig.update_layout( + xaxis_title=None, + yaxis_title="Count", + height=200, + width=950, + margin=dict(l=0, r=0, t=0, b=0), +) +fig.show() +fig.write_image("language_count.pdf") + +chat_dates = [ + datetime.datetime.fromtimestamp(x, tz=timezone("US/Pacific")).strftime("%Y-%m-%d") + for x in df["tstamp"] +] + + +def to_remove(x): + for d in ["08-09", "08-08", "08-07", "08-06", "08-05", "08-04"]: + if d in x: + return True + return False + + +chat_dates = [x for x in chat_dates if not to_remove(x)] + +chat_dates_counts = pd.value_counts(chat_dates) * scale +print(f"mean #chat per day: {np.mean(chat_dates_counts):.2f}") + +fig = px.bar(x=chat_dates_counts.index, y=chat_dates_counts) +fig.update_layout( + xaxis_title="Dates", + yaxis_title="Count", + height=200, + width=950, + margin=dict(l=0, r=0, t=0, b=0), +) +fig.show() +fig.write_image("daily_conversation_count.pdf") + +import transformers + +tokenizer = transformers.AutoTokenizer.from_pretrained( + "lmsys/vicuna-7b-v1.5", use_fast=False +) + +prompts = [] +responses = [] +for conv in df["conversation"]: + for row in conv: + if row["role"] == "user": + prompts.append(row["content"]) + else: + responses.append(row["content"]) + +print(f"#prompts: {len(prompts)}") +print(f"#responses: {len(responses)}") + + +prompt_lens = [len(tokenizer(x).input_ids) for x in tqdm(prompts)] +print() +print(f"mean prompt len: {np.mean(prompt_lens):.2f}") + +response_lens = [len(tokenizer(x).input_ids) if x else 0 for x in tqdm(responses)] +print() +print(f"mean response len: {np.mean(response_lens):.2f}") diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/filter_bad_conv.py b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/filter_bad_conv.py new file mode 100644 index 000000000..3ccde1ca5 --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/filter_bad_conv.py @@ -0,0 +1,148 @@ +""" +Filter conversations for release. 
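+
+It drops conversations that are malformed, contain blocked words, hit the PII
+anonymization/redaction placeholders, or are flagged by OpenAI moderation; the
+TypeCode enum below also reserves codes for blocked models and for too-short
+or too-frequent conversations.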
+ +Dependency: +pip install opencc-python-reimplementedpip install opencc-python-reimplemented + +Usage: +python3 filter_bad_conv_lmsys_chat_1m.py --in clean_battle_conv_20230630_tagged_v1_pii.json +""" +import argparse +from concurrent.futures import ProcessPoolExecutor +from collections import defaultdict +from enum import Enum, auto +import json +import os +import random + +from tqdm import tqdm +import opencc + +BLOCKED_WORDS_FILENAME = "blocked_words.json" +blocked_words = [] +frequency = defaultdict(lambda: 0) + +cc_converter = opencc.OpenCC("t2s") + + +class TypeCode(Enum): + CORRECT = auto() + ANONYMIZED = auto() + REDACTED = auto() + BAD_FORMAT = auto() + BLOCKED_WORD = auto() + BLOCKED_MODEL = auto() + TOO_SHORT = auto() + TOO_FREQUENT = auto() + + +def detect_type(conv): + for key in ["conversation_a", "conversation_b", "conversation"]: + if key not in conv: + continue + + messages = [row["content"] for row in conv[key]] + for msg in messages: + if not isinstance(msg, str): + return TypeCode.BAD_FORMAT + + if len(messages) == 0: + return TypeCode.BAD_FORMAT + + user_prompts = [ + row["content"].lower().strip() for row in conv[key] if row["role"] == "user" + ] + + for msg in messages: + msg = cc_converter.convert(msg.lower()) + if "" in msg: + return TypeCode.ANONYMIZED + if "" in msg: + return TypeCode.REDACTED + + for w in blocked_words: + if w in msg: + return TypeCode.BLOCKED_WORD + + return TypeCode.CORRECT + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--in-file", type=str, required=True) + parser.add_argument("--sample", type=int) + args = parser.parse_args() + + # Read conversations + convs = json.load(open(args.in_file)) + print(f"#conv: {len(convs)}") + + # Read blocked words + if os.path.exists(BLOCKED_WORDS_FILENAME): + blocked_words = json.load(open(BLOCKED_WORDS_FILENAME)) + blocked_words = [cc_converter.convert(w) for w in blocked_words] + + # Start filter + ct_bad_format = 0 + ct_anonymized = 0 + ct_redacted = 0 + ct_error = 0 + ct_lang_filter = 0 + ct_flagged = 0 + ct_blocked_word = 0 + ct_blocked_model = 0 + ct_too_short = 0 + ct_too_frequent = 0 + + type_codes = [] + with ProcessPoolExecutor() as executor: + for result in tqdm(executor.map(detect_type, convs), total=len(convs)): + type_codes.append(result) + + new_convs = [] + for conv, type_code in zip(convs, type_codes): + if type_code == TypeCode.BAD_FORMAT: + ct_bad_format += 1 + continue + + if type_code == TypeCode.ANONYMIZED: + ct_anonymized += 1 + continue + elif type_code == TypeCode.REDACTED: + ct_redacted += 1 + continue + elif type_code == TypeCode.BLOCKED_WORD: + ct_blocked_word += 1 + continue + elif type_code == TypeCode.BLOCKED_MODEL: + ct_blocked_model += 1 + continue + elif type_code == TypeCode.TOO_SHORT: + ct_too_short += 1 + continue + elif type_code == TypeCode.TOO_FREQUENT: + ct_too_frequent += 1 + continue + + if "openai_moderation" in conv and conv["openai_moderation"]["flagged"]: + ct_flagged += 1 + continue + + if type_code in [TypeCode.CORRECT]: + new_convs.append(conv) + + if args.sample: + random.seed(42) + random.shuffle(new_convs) + new_convs = new_convs[: args.sample] + + print(f"ct_anonymized: {ct_anonymized}, ct_redacted: {ct_redacted}") + print(f"ct_bad_format: {ct_bad_format}, ct_flagged: {ct_flagged}") + print(f"ct_blocked_word: {ct_blocked_word}, ct_blocked_model: {ct_blocked_model}") + print(f"ct_too_short: {ct_too_short}, ct_too_frequent: {ct_too_frequent}") + print(f"new_conv: {len(new_convs)}") + + out_file = 
args.in_file.replace(".json", ".s1.json") + print(f"Output to {out_file}") + with open(out_file, "w") as fout: + json.dump(new_convs, fout, indent=2, ensure_ascii=False) diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/final_post_processing.py b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/final_post_processing.py new file mode 100644 index 000000000..e368e92a1 --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/final_post_processing.py @@ -0,0 +1,27 @@ +import argparse +import json + +from tqdm import tqdm +import numpy as np + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--in-file", type=str, required=True) + args = parser.parse_args() + + # Read conversations + convs = json.load(open(args.in_file)) + print(f"#conv: {len(convs)}") + + # Delete some fileds + for c in convs: + del c["tstamp"] + del c["user_id"] + + # Write + print(f"#out conv: {len(convs)}") + out_file = args.in_file.replace(".json", ".s2.json") + print(f"Output to {out_file}") + with open(out_file, "w") as fout: + json.dump(convs, fout, indent=2, ensure_ascii=False) diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/instructions.md b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/instructions.md new file mode 100644 index 000000000..4c439731f --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/instructions.md @@ -0,0 +1,23 @@ +``` +export BASE=clean_conv_20230809_100k_pii +export SCALE=10 + +# filter words +python3 filter_bad_conv.py --in $BASE.json + +# Clean up some fileds (e.g., timestamps) +python3 final_post_processing.py --in $BASE.s1.json + +# upload to hf +python3 upload_hf_dataset.py --in $BASE.s1.s2.json + +# Make another version with openai moderation tag +python3 merge_oai_tag.py --in $BASE.s1.s2.json + +# Make visualizations +python3 compute_stats.py --in $BASE.s1.json --scale $SCALE + +# Copy figures +scp "atlas:/data/lmzheng/FastChat/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/*.pdf" . 
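+# (editor's suggestion, not in the original instructions) sanity-check the
+# upload before making the dataset public, e.g.:
+# python3 -c "from datasets import load_dataset; print(load_dataset('lmsys/lmsys-chat-1m', split='train'))"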
+``` + diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/merge_oai_tag.py b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/merge_oai_tag.py new file mode 100644 index 000000000..18bef5f19 --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/merge_oai_tag.py @@ -0,0 +1,45 @@ +import argparse +import json +import time + +from tqdm import tqdm + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--in-file", type=str, required=True) + parser.add_argument("--sample", type=int) + args = parser.parse_args() + + tag_file = "clean_conv_20230809_1.5M_oai_filter_v2.json" + # tag_file = "clean_conv_20230809_1.5M_oai_filter_v2_100k.json" + in_file = args.in_file + tic = time.time() + + # Load tags + print("Load tags...") + tag_data = json.load(open(tag_file)) + tag_dict = {} + for c in tqdm(tag_data): + tag_dict[c["conversation_id"]] = [x["oai_filter"] for x in c["conversation"]] + print(f"elapsed: {time.time() - tic:.2f} s") + + # Append to input_file + print("Load inputs...") + input_data = json.load(open(in_file)) + for c in tqdm(input_data): + cid = c["conversation_id"] + if cid in tag_dict: + c["openai_moderation"] = tag_dict[cid] + else: + print(f"missing tag for conv {cid}") + exit() + print(f"elapsed: {time.time() - tic:.2f} s") + + # Write output + print("Write outputs...") + out_file = in_file.replace(".json", ".with_tag.json") + print(f"Output to {out_file}") + with open(out_file, "w") as fout: + json.dump(input_data, fout, indent=2, ensure_ascii=False) + print(f"elapsed: {time.time() - tic:.2f} s") diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/process_all.sh b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/process_all.sh new file mode 100644 index 000000000..5bae9fbad --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/process_all.sh @@ -0,0 +1,18 @@ +export BASE=clean_conv_20230809_1.5M_pii +#export BASE=clean_conv_20230809_100k_pii +export SCALE=1 + +# Filter words +python3 filter_bad_conv.py --in $BASE.json --sample 1000000 + +# Clean up some fileds (e.g., timestamps) +python3 final_post_processing.py --in $BASE.s1.json + +# Upload to hf +python3 upload_hf_dataset.py --in $BASE.s1.s2.json + +# Make another version with openai moderation tag +python3 merge_oai_tag.py --in $BASE.s1.s2.json + +# Make visualizations +python3 compute_stats.py --in $BASE.s1.json --scale $SCALE diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/upload_hf_dataset.py b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/upload_hf_dataset.py new file mode 100644 index 000000000..41d0fbdb5 --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/upload_hf_dataset.py @@ -0,0 +1,17 @@ +""" +Upload to huggingface. 
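+
+Usage (matching instructions.md in this directory):
+python3 upload_hf_dataset.py --in clean_conv_20230809_100k_pii.s1.s2.json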
+""" +import argparse +import json +from datasets import Dataset, DatasetDict, load_dataset + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--in-file", type=str, required=True) + args = parser.parse_args() + + objs = json.load(open(args.in_file)) + print(f"#convs: {len(objs)}") + data = Dataset.from_list(objs) + data.push_to_hub("lmsys/lmsys-chat-1m", private=True) diff --git a/fastchat/serve/monitor/hf_space_leaderboard_app.py b/fastchat/serve/monitor/hf_space_leaderboard_app.py deleted file mode 100644 index 8fb21fbdc..000000000 --- a/fastchat/serve/monitor/hf_space_leaderboard_app.py +++ /dev/null @@ -1,258 +0,0 @@ -"""A gradio app that renders a static leaderboard. This is used for Hugging Face Space.""" -import ast -import argparse -import pickle - -import gradio as gr -import numpy as np - - -notebook_url = "https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing" - - -basic_component_values = [None] * 6 -leader_component_values = [None] * 5 - - -def make_leaderboard_md(elo_results): - leaderboard_md = f""" -# Leaderboard -| [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | - -🏆 This leaderboard is based on the following three benchmarks. -- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 40K+ user votes to compute Elo ratings. -- [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses. -- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks. - -💻 We use [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) to compute MT-bench scores (single-answer grading on a scale of 10) and win rates (against gpt-3.5). The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MMLU scores are computed by [InstructEval](https://github.com/declare-lab/instruct-eval) and [Chain-of-Thought Hub](https://github.com/FranxYao/chain-of-thought-hub). Higher values are better for all benchmarks. Empty cells mean not available. -""" - return leaderboard_md - - -def make_leaderboard_md_live(elo_results): - leaderboard_md = f""" -# Leaderboard -Last updated: {elo_results["last_updated_datetime"]} -{elo_results["leaderboard_table"]} -""" - return leaderboard_md - - -def update_elo_components(max_num_files, elo_results_file): - log_files = get_log_files(max_num_files) - - # Leaderboard - if elo_results_file is None: # Do live update - battles = clean_battle_data(log_files) - elo_results = report_elo_analysis_results(battles) - - leader_component_values[0] = make_leaderboard_md_live(elo_results) - leader_component_values[1] = elo_results["win_fraction_heatmap"] - leader_component_values[2] = elo_results["battle_count_heatmap"] - leader_component_values[3] = elo_results["bootstrap_elo_rating"] - leader_component_values[4] = elo_results["average_win_rate_bar"] - - # Basic stats - basic_stats = report_basic_stats(log_files) - md0 = f"Last updated: {basic_stats['last_updated_datetime']}" - - md1 = "### Action Histogram\n" - md1 += basic_stats["action_hist_md"] + "\n" - - md2 = "### Anony. 
Vote Histogram\n" - md2 += basic_stats["anony_vote_hist_md"] + "\n" - - md3 = "### Model Call Histogram\n" - md3 += basic_stats["model_hist_md"] + "\n" - - md4 = "### Model Call (Last 24 Hours)\n" - md4 += basic_stats["num_chats_last_24_hours"] + "\n" - - basic_component_values[0] = md0 - basic_component_values[1] = basic_stats["chat_dates_bar"] - basic_component_values[2] = md1 - basic_component_values[3] = md2 - basic_component_values[4] = md3 - basic_component_values[5] = md4 - - -def update_worker(max_num_files, interval, elo_results_file): - while True: - tic = time.time() - update_elo_components(max_num_files, elo_results_file) - durtaion = time.time() - tic - print(f"update duration: {durtaion:.2f} s") - time.sleep(max(interval - durtaion, 0)) - - -def load_demo(url_params, request: gr.Request): - logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}") - return basic_component_values + leader_component_values - - -def model_hyperlink(model_name, link): - return f'{model_name}' - - -def load_leaderboard_table_csv(filename, add_hyperlink=True): - lines = open(filename).readlines() - heads = [v.strip() for v in lines[0].split(",")] - rows = [] - for i in range(1, len(lines)): - row = [v.strip() for v in lines[i].split(",")] - for j in range(len(heads)): - item = {} - for h, v in zip(heads, row): - if h == "Arena Elo rating": - if v != "-": - v = int(ast.literal_eval(v)) - else: - v = np.nan - elif h == "MMLU": - if v != "-": - v = round(ast.literal_eval(v) * 100, 1) - else: - v = np.nan - elif h == "MT-bench (win rate %)": - if v != "-": - v = round(ast.literal_eval(v[:-1]), 1) - else: - v = np.nan - elif h == "MT-bench (score)": - if v != "-": - v = round(ast.literal_eval(v), 2) - else: - v = np.nan - item[h] = v - if add_hyperlink: - item["Model"] = model_hyperlink(item["Model"], item["Link"]) - rows.append(item) - - return rows - - -def build_basic_stats_tab(): - empty = "Loading ..." - basic_component_values[:] = [empty, None, empty, empty, empty, empty] - - md0 = gr.Markdown(empty) - gr.Markdown("#### Figure 1: Number of model calls and votes") - plot_1 = gr.Plot(show_label=False) - with gr.Row(): - with gr.Column(): - md1 = gr.Markdown(empty) - with gr.Column(): - md2 = gr.Markdown(empty) - with gr.Row(): - with gr.Column(): - md3 = gr.Markdown(empty) - with gr.Column(): - md4 = gr.Markdown(empty) - return [md0, plot_1, md1, md2, md3, md4] - - -def build_leaderboard_tab(elo_results_file, leaderboard_table_file): - if elo_results_file is None: # Do live update - md = "Loading ..." 
- p1 = p2 = p3 = p4 = None - else: - with open(elo_results_file, "rb") as fin: - elo_results = pickle.load(fin) - - md = make_leaderboard_md(elo_results) - p1 = elo_results["win_fraction_heatmap"] - p2 = elo_results["battle_count_heatmap"] - p3 = elo_results["bootstrap_elo_rating"] - p4 = elo_results["average_win_rate_bar"] - - md_1 = gr.Markdown(md, elem_id="leaderboard_markdown") - - if leaderboard_table_file: - data = load_leaderboard_table_csv(leaderboard_table_file) - headers = [ - "Model", - "Arena Elo rating", - "MT-bench (score)", - "MT-bench (win rate %)", - "MMLU", - "License", - ] - values = [] - for item in data: - row = [] - for key in headers: - value = item[key] - row.append(value) - values.append(row) - values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9) - - headers[1] = "⭐ " + headers[1] - headers[2] = "📈 " + headers[2] - - gr.Dataframe( - headers=headers, - datatype=["markdown", "number", "number", "number", "number", "str"], - value=values, - elem_id="leaderboard_dataframe", - ) - gr.Markdown( - "If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model)." - ) - else: - pass - - gr.Markdown( - f"""## More Statistics for Chatbot Arena\n -We added some additional figures to show more statistics. The code for generating them is also included in this [notebook]({notebook_url}). -Please note that you may see different orders from different ranking methods. This is expected for models that perform similarly, as demonstrated by the confidence interval in the bootstrap figure. Going forward, we prefer the classical Elo calculation because of its scalability and interpretability. You can find more discussions in this blog [post](https://lmsys.org/blog/2023-05-03-arena/). -""" - ) - - leader_component_values[:] = [md, p1, p2, p3, p4] - - with gr.Row(): - with gr.Column(): - gr.Markdown( - "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles" - ) - plot_1 = gr.Plot(p1, show_label=False) - with gr.Column(): - gr.Markdown( - "#### Figure 2: Battle Count for Each Combination of Models (without Ties)" - ) - plot_2 = gr.Plot(p2, show_label=False) - with gr.Row(): - with gr.Column(): - gr.Markdown( - "#### Figure 3: Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)" - ) - plot_3 = gr.Plot(p3, show_label=False) - with gr.Column(): - gr.Markdown( - "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)" - ) - plot_4 = gr.Plot(p4, show_label=False) - return [md_1, plot_1, plot_2, plot_3, plot_4] - - -def build_demo(elo_results_file, leaderboard_table_file): - text_size = gr.themes.sizes.text_lg - - with gr.Blocks( - title="Chatbot Arena Leaderboard", - theme=gr.themes.Base(text_size=text_size), - ) as demo: - leader_components = build_leaderboard_tab( - elo_results_file, leaderboard_table_file - ) - - return demo - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--share", action="store_true") - args = parser.parse_args() - - demo = build_demo("elo_results_20230619.pkl", "leaderboard_table_20230619.csv") - demo.launch(share=args.share) diff --git a/fastchat/serve/monitor/intersect_conv_file.py b/fastchat/serve/monitor/intersect_conv_file.py new file mode 100644 index 000000000..9eadd7cd5 --- /dev/null +++ b/fastchat/serve/monitor/intersect_conv_file.py @@ -0,0 +1,25 @@ +""" +Take the intersection of two conversation files. 
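+
+Keeps only the conversations in --input whose conversation_id appears in the
+--conv-id file, then writes the intersection to --out-file.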
+ +Usage: python3 -m fastchat.data.merge --input input.json --conv-id conv_id_file.json --out intersect.json +""" + +import argparse +import json + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input", type=str, required=True) + parser.add_argument("--conv-id", type=str, required=True) + parser.add_argument("--out-file", type=str, default="intersect.json") + args = parser.parse_args() + + conv_id_objs = json.load(open(args.conv_id, "r")) + conv_ids = set(x["conversation_id"] for x in conv_id_objs) + + objs = json.load(open(args.input, "r")) + after_objs = [x for x in objs if x["conversation_id"] in conv_ids] + + print(f"#in: {len(objs)}, #out: {len(after_objs)}") + json.dump(after_objs, open(args.out_file, "w"), indent=2, ensure_ascii=False) diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index d576b42b0..5efe8486c 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -1,5 +1,10 @@ -# sudo apt install pkg-config libicu-dev -# pip install pytz gradio gdown plotly polyglot pyicu pycld2 tabulate +""" +Live monitor of the website statistics and leaderboard. + +Dependency: +sudo apt install pkg-config libicu-dev +pip install pytz gradio gdown plotly polyglot pyicu pycld2 tabulate +""" import argparse import ast @@ -53,7 +58,7 @@ def update_elo_components(max_num_files, elo_results_file): # Leaderboard if elo_results_file is None: # Do live update - battles = clean_battle_data(log_files) + battles = clean_battle_data(log_files, []) elo_results = report_elo_analysis_results(battles) leader_component_values[0] = make_leaderboard_md_live(elo_results) @@ -250,11 +255,14 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file): def build_demo(elo_results_file, leaderboard_table_file): + from fastchat.serve.gradio_web_server import block_css + text_size = gr.themes.sizes.text_lg with gr.Blocks( title="Monitor", theme=gr.themes.Base(text_size=text_size), + css=block_css, ) as demo: with gr.Tabs() as tabs: with gr.Tab("Leaderboard", id=0): diff --git a/fastchat/serve/monitor/replace_model_name.py b/fastchat/serve/monitor/replace_model_name.py deleted file mode 100644 index ff2667e2f..000000000 --- a/fastchat/serve/monitor/replace_model_name.py +++ /dev/null @@ -1,21 +0,0 @@ -""" -Usage: -python3 replace_model_name.py --in clean_conv_20230809_10k.json -""" - -import argparse -import json - -from fastchat.serve.monitor.clean_battle_data import replace_model_name - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--in-file", type=str, required=True) - args = parser.parse_args() - - convs = json.load(open(args.in_file)) - for x in convs: - x["model"] = replace_model_name(x["model"]) - - with open(args.in_file, "w") as fout: - json.dump(convs, fout, indent=2, ensure_ascii=False) diff --git a/fastchat/serve/monitor/summarize_cluster.py b/fastchat/serve/monitor/summarize_cluster.py index 4ca7f48d2..aa53c265d 100644 --- a/fastchat/serve/monitor/summarize_cluster.py +++ b/fastchat/serve/monitor/summarize_cluster.py @@ -1,5 +1,4 @@ """ - Usage: python3 summarize_cluster.py --in results_c20_kmeans_cluster.pkl --model gpt-4 """ From 30a6ffc7fcd90bd1037f5a60cbae5b4b6986fc93 Mon Sep 17 00:00:00 2001 From: "Jeff (Zhen) Wang" Date: Tue, 19 Sep 2023 06:18:38 +1000 Subject: [PATCH 37/45] merge google/flan based adapters: T5Adapter, CodeT5pAdapter, FlanAdapter (#2411) --- fastchat/model/model_adapter.py | 40 +++++++-------------------------- 1 file 
changed, 8 insertions(+), 32 deletions(-) diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index 72ef6f918..e4f049ed3 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -23,7 +23,6 @@ AutoTokenizer, LlamaTokenizer, LlamaForCausalLM, - T5Tokenizer, ) from fastchat.constants import CPU_ISA @@ -31,9 +30,7 @@ from fastchat.modules.awq import AWQConfig, load_awq_quantized from fastchat.conversation import Conversation, get_conv_template from fastchat.model.compression import load_compress_model -from fastchat.model.llama_condense_monkey_patch import ( - replace_llama_with_condense, -) +from fastchat.model.llama_condense_monkey_patch import replace_llama_with_condense from fastchat.model.model_chatglm import generate_stream_chatglm from fastchat.model.model_codet5p import generate_stream_codet5p from fastchat.model.model_falcon import generate_stream_falcon @@ -635,11 +632,14 @@ def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("vicuna_v1.1") -class CodeT5pAdapter(BaseModelAdapter): - """The model adapter for Salesforce/codet5p-6b""" +class GoogleFlanAdapter(BaseModelAdapter): + """The model adapter for google/Flan based models, such as Salesforce/codet5p-6b, lmsys/fastchat-t5-3b-v1.0, flan-t5-*, flan-ul2""" def match(self, model_path: str): - return "codet5p" in model_path.lower() + return any( + model_str in model_path.lower() + for model_str in ["flan-", "fastchat-t5", "codet5p"] + ) def load_model(self, model_path: str, from_pretrained_kwargs: dict): revision = from_pretrained_kwargs.get("revision", "main") @@ -653,28 +653,6 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): return model, tokenizer -class T5Adapter(BaseModelAdapter): - """The model adapter for lmsys/fastchat-t5-3b-v1.0""" - - def match(self, model_path: str): - return "t5" in model_path.lower() - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - tokenizer = T5Tokenizer.from_pretrained(model_path, revision=revision) - model = AutoModelForSeq2SeqLM.from_pretrained( - model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs - ) - return model, tokenizer - - -class FlanAdapter(T5Adapter): - """The model adapter for flan-t5-*, flan-ul2""" - - def match(self, model_path: str): - return "flan" in model_path.lower() - - class KoalaAdapter(BaseModelAdapter): """The model adapter for Koala""" @@ -1636,9 +1614,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(VicunaAdapter) register_model_adapter(AiroborosAdapter) register_model_adapter(LongChatAdapter) -register_model_adapter(CodeT5pAdapter) -register_model_adapter(T5Adapter) -register_model_adapter(FlanAdapter) +register_model_adapter(GoogleFlanAdapter) register_model_adapter(KoalaAdapter) register_model_adapter(AlpacaAdapter) register_model_adapter(ChatGLMAdapter) From 16be5cfd0c4621601fa535c829dec25e12063c0f Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 18 Sep 2023 20:41:57 +0000 Subject: [PATCH 38/45] Fix docs --- fastchat/llm_judge/README.md | 1 + fastchat/model/model_adapter.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fastchat/llm_judge/README.md b/fastchat/llm_judge/README.md index eb1e3b2e2..f1755e3e5 100644 --- a/fastchat/llm_judge/README.md +++ b/fastchat/llm_judge/README.md @@ -64,6 +64,7 @@ This mode asks GPT-4 to grade and give a score to model's answer directly withou 
For each turn, GPT-4 will give a score on a scale of 10. We then compute the average score on all turns. ``` +export OPENAI_API_KEY=XXXXXX # set the OpenAI API key python gen_judgment.py --model-list [LIST-OF-MODEL-ID] --parallel [num-concurrent-api-call] ``` diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index e4f049ed3..d2ac56f8d 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -23,6 +23,7 @@ AutoTokenizer, LlamaTokenizer, LlamaForCausalLM, + T5Tokenizer, ) from fastchat.constants import CPU_ISA @@ -632,7 +633,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("vicuna_v1.1") -class GoogleFlanAdapter(BaseModelAdapter): +class GoogleT5Adapter(BaseModelAdapter): """The model adapter for google/Flan based models, such as Salesforce/codet5p-6b, lmsys/fastchat-t5-3b-v1.0, flan-t5-*, flan-ul2""" def match(self, model_path: str): @@ -643,7 +644,7 @@ def match(self, model_path: str): def load_model(self, model_path: str, from_pretrained_kwargs: dict): revision = from_pretrained_kwargs.get("revision", "main") - tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision) + tokenizer = T5Tokenizer.from_pretrained(model_path, revision=revision) model = AutoModelForSeq2SeqLM.from_pretrained( model_path, low_cpu_mem_usage=True, @@ -1614,7 +1615,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(VicunaAdapter) register_model_adapter(AiroborosAdapter) register_model_adapter(LongChatAdapter) -register_model_adapter(GoogleFlanAdapter) +register_model_adapter(GoogleT5Adapter) register_model_adapter(KoalaAdapter) register_model_adapter(AlpacaAdapter) register_model_adapter(ChatGLMAdapter) From e4758da20dae96486fceb0cce24e5107c4cfc2eb Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 18 Sep 2023 16:31:00 -0700 Subject: [PATCH 39/45] Update UI (#2446) --- fastchat/serve/gradio_block_arena_anony.py | 43 +++++++++++++--------- fastchat/serve/gradio_block_arena_named.py | 32 ++++++++++------ fastchat/serve/gradio_web_server.py | 6 ++- fastchat/serve/gradio_web_server_multi.py | 6 +-- 4 files changed, 52 insertions(+), 35 deletions(-) diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index a92cd9790..2bdb9abbb 100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -25,6 +25,7 @@ no_change_btn, enable_btn, disable_btn, + invisible_btn, acknowledgment_md, ip_expiration_dict, ) @@ -64,7 +65,6 @@ def load_demo_side_by_side_anony(models_, url_params): gr.Textbox.update(visible=True), gr.Box.update(visible=True), gr.Row.update(visible=True), - gr.Row.update(visible=True), gr.Accordion.update(visible=True), ) ) @@ -148,7 +148,12 @@ def regenerate(state0, state1, request: gr.Request): def clear_history(request: gr.Request): logger.info(f"clear_history (anony). 
ip: {request.client.host}") return ( - [None] * num_sides + [None] * num_sides + anony_names + [""] + [disable_btn] * 6 + [None] * num_sides + + [None] * num_sides + + anony_names + + [""] + + [invisible_btn] * 4 + + [disable_btn] * 2 ) @@ -397,11 +402,6 @@ def build_side_by_side_ui_anony(models): gr.Markdown(notice_markdown, elem_id="notice_markdown") with gr.Box(elem_id="share-region-anony"): - with gr.Row(): - for i in range(num_sides): - with gr.Column(): - model_selectors[i] = gr.Markdown(anony_names[i]) - with gr.Row(): for i in range(num_sides): label = "Model A" if i == 0 else "Model B" @@ -410,12 +410,22 @@ def build_side_by_side_ui_anony(models): label=label, elem_id=f"chatbot", visible=False, height=550 ) - with gr.Box() as button_row: - with gr.Row(): - leftvote_btn = gr.Button(value="👈 A is better", interactive=False) - rightvote_btn = gr.Button(value="👉 B is better", interactive=False) - tie_btn = gr.Button(value="🤝 Tie", interactive=False) - bothbad_btn = gr.Button(value="👎 Both are bad", interactive=False) + with gr.Row(): + for i in range(num_sides): + with gr.Column(): + model_selectors[i] = gr.Markdown(anony_names[i]) + + with gr.Row(): + leftvote_btn = gr.Button( + value="👈 A is better", visible=False, interactive=False + ) + rightvote_btn = gr.Button( + value="👉 B is better", visible=False, interactive=False + ) + tie_btn = gr.Button(value="🤝 Tie", visible=False, interactive=False) + bothbad_btn = gr.Button( + value="👎 Both are bad", visible=False, interactive=False + ) with gr.Row(): with gr.Column(scale=20): @@ -427,11 +437,11 @@ def build_side_by_side_ui_anony(models): elem_id="input_box", ) with gr.Column(scale=1, min_width=50): - send_btn = gr.Button(value="Battle", visible=False, variant="primary") + send_btn = gr.Button(value="Send", visible=False, variant="primary") - with gr.Row() as button_row2: - regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) + with gr.Row() as button_row: clear_btn = gr.Button(value="🗑️ Clear history", interactive=False) + regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) share_btn = gr.Button(value="📷 Share") with gr.Accordion("Parameters", open=False, visible=True) as parameter_row: @@ -557,6 +567,5 @@ def build_side_by_side_ui_anony(models): textbox, send_btn, button_row, - button_row2, parameter_row, ) diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py index 868a5759a..3da3b223a 100644 --- a/fastchat/serve/gradio_block_arena_named.py +++ b/fastchat/serve/gradio_block_arena_named.py @@ -24,6 +24,7 @@ no_change_btn, enable_btn, disable_btn, + invisible_btn, acknowledgment_md, get_model_description_md, ip_expiration_dict, @@ -69,7 +70,6 @@ def load_demo_side_by_side_named(models, url_params): gr.Textbox.update(visible=True), gr.Box.update(visible=True), gr.Row.update(visible=True), - gr.Row.update(visible=True), gr.Accordion.update(visible=True), ) ) @@ -137,7 +137,13 @@ def regenerate(state0, state1, request: gr.Request): def clear_history(request: gr.Request): logger.info(f"clear_history (named). 
ip: {request.client.host}") - return [None] * num_sides + [None] * num_sides + [""] + [disable_btn] * 6 + return ( + [None] * num_sides + + [None] * num_sides + + [""] + + [invisible_btn] * 4 + + [disable_btn] * 2 + ) def share_click(state0, state1, model_selector0, model_selector1, request: gr.Request): @@ -342,12 +348,17 @@ def build_side_by_side_ui_named(models): label=label, elem_id=f"chatbot", visible=False, height=550 ) - with gr.Box() as button_row: - with gr.Row(): - leftvote_btn = gr.Button(value="👈 A is better", interactive=False) - rightvote_btn = gr.Button(value="👉 B is better", interactive=False) - tie_btn = gr.Button(value="🤝 Tie", interactive=False) - bothbad_btn = gr.Button(value="👎 Both are bad", interactive=False) + with gr.Row(): + leftvote_btn = gr.Button( + value="👈 A is better", visible=False, interactive=False + ) + rightvote_btn = gr.Button( + value="👉 B is better", visible=False, interactive=False + ) + tie_btn = gr.Button(value="🤝 Tie", visible=False, interactive=False) + bothbad_btn = gr.Button( + value="👎 Both are bad", visible=False, interactive=False + ) with gr.Row(): with gr.Column(scale=20): @@ -359,9 +370,9 @@ def build_side_by_side_ui_named(models): elem_id="input_box", ) with gr.Column(scale=1, min_width=50): - send_btn = gr.Button(value="Battle", visible=False, variant="primary") + send_btn = gr.Button(value="Send", visible=False, variant="primary") - with gr.Row() as button_row2: + with gr.Row() as button_row: regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) clear_btn = gr.Button(value="🗑️ Clear history", interactive=False) share_btn = gr.Button(value="📷 Share") @@ -491,6 +502,5 @@ def build_side_by_side_ui_named(models): textbox, send_btn, button_row, - button_row2, parameter_row, ) diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index b17cee42c..3c5e34675 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -48,15 +48,17 @@ headers = {"User-Agent": "FastChat Client"} no_change_btn = gr.Button.update() -enable_btn = gr.Button.update(interactive=True) +enable_btn = gr.Button.update(interactive=True, visible=True) disable_btn = gr.Button.update(interactive=False) +invisible_btn = gr.Button.update(interactive=False, visible=False) controller_url = None enable_moderation = False acknowledgment_md = """ +### Acknowledgment
-[old sponsor-logo block: HTML markup stripped during extraction]
-Acknowledgment: We thank Kaggle, MBZUAI, AnyScale, and HuggingFace for their sponsorship.
+[new sponsor-logo block: HTML markup stripped during extraction; only the image alt texts survive below]
+We thank Kaggle, MBZUAI, AnyScale, and HuggingFace for their sponsorship.
Image 1 Image 2 Image 3 diff --git a/fastchat/serve/gradio_web_server_multi.py b/fastchat/serve/gradio_web_server_multi.py index 09e227f90..97aecc66c 100644 --- a/fastchat/serve/gradio_web_server_multi.py +++ b/fastchat/serve/gradio_web_server_multi.py @@ -100,7 +100,7 @@ def load_demo(url_params, request: gr.Request): def build_demo(models, elo_results_file, leaderboard_table_file): with gr.Blocks( title="Chat with Open Large Language Models", - theme=gr.themes.Base(), + theme=gr.themes.Default(), css=block_css, ) as demo: with gr.Tabs() as tabs: @@ -112,7 +112,6 @@ def build_demo(models, elo_results_file, leaderboard_table_file): b_textbox, b_send_btn, b_button_row, - b_button_row2, b_parameter_row, ) = build_side_by_side_ui_anony(models) b_list = ( @@ -123,7 +122,6 @@ def build_demo(models, elo_results_file, leaderboard_table_file): b_textbox, b_send_btn, b_button_row, - b_button_row2, b_parameter_row, ] ) @@ -136,7 +134,6 @@ def build_demo(models, elo_results_file, leaderboard_table_file): c_textbox, c_send_btn, c_button_row, - c_button_row2, c_parameter_row, ) = build_side_by_side_ui_named(models) c_list = ( @@ -147,7 +144,6 @@ def build_demo(models, elo_results_file, leaderboard_table_file): c_textbox, c_send_btn, c_button_row, - c_button_row2, c_parameter_row, ] ) From 68f1facfb431f07326c72df2484c78bfe37ac77a Mon Sep 17 00:00:00 2001 From: Brandon Biggs Date: Tue, 19 Sep 2023 13:39:33 -0600 Subject: [PATCH 40/45] Add Optional SSL Support to controller.py (#2448) --- fastchat/serve/controller.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/fastchat/serve/controller.py b/fastchat/serve/controller.py index 04f119f72..3c0518e8e 100644 --- a/fastchat/serve/controller.py +++ b/fastchat/serve/controller.py @@ -8,6 +8,7 @@ from enum import Enum, auto import json import logging +import os import time from typing import List, Union import threading @@ -318,6 +319,13 @@ def create_controller(): choices=["lottery", "shortest_queue"], default="shortest_queue", ) + parser.add_argument( + "--ssl", + action="store_true", + required=False, + default=False, + help="Enable SSL. Requires OS Environment variables 'SSL_KEYFILE' and 'SSL_CERTFILE'.", + ) args = parser.parse_args() logger.info(f"args: {args}") @@ -327,4 +335,14 @@ def create_controller(): if __name__ == "__main__": args, controller = create_controller() - uvicorn.run(app, host=args.host, port=args.port, log_level="info") + if args.ssl: + uvicorn.run( + app, + host=args.host, + port=args.port, + log_level="info", + ssl_keyfile=os.environ["SSL_KEYFILE"], + ssl_certfile=os.environ["SSL_CERTFILE"] + ) + else: + uvicorn.run(app, host=args.host, port=args.port, log_level="info") From db8e2714d2ed2202f399a40919a4854b7895210b Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 19 Sep 2023 19:48:45 +0000 Subject: [PATCH 41/45] Format & Improve docs --- README.md | 1 + fastchat/serve/controller.py | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 7d3c7b20b..267fad328 100644 --- a/README.md +++ b/README.md @@ -238,6 +238,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m fastchat.serve.model_worker --model-path lmsys ```bash python3 -m fastchat.serve.gradio_web_server_multi ``` +- The default model worker based on huggingface/transformers has great compatibility but can be slow. If you want high-throughput serving, you can try [vLLM integration](docs/vllm_integration.md). 
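+  As a minimal sketch (see docs/vllm_integration.md for the authoritative steps; the model path below is only an example and the flags may differ across vLLM versions):
+  ```bash
+  # assumes `pip install vllm`
+  python3 -m fastchat.serve.vllm_worker --model-path lmsys/vicuna-7b-v1.5
+  ```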
## API ### OpenAI-Compatible RESTful APIs & SDK diff --git a/fastchat/serve/controller.py b/fastchat/serve/controller.py index 3c0518e8e..a67da62c4 100644 --- a/fastchat/serve/controller.py +++ b/fastchat/serve/controller.py @@ -337,12 +337,12 @@ def create_controller(): args, controller = create_controller() if args.ssl: uvicorn.run( - app, - host=args.host, - port=args.port, - log_level="info", - ssl_keyfile=os.environ["SSL_KEYFILE"], - ssl_certfile=os.environ["SSL_CERTFILE"] + app, + host=args.host, + port=args.port, + log_level="info", + ssl_keyfile=os.environ["SSL_KEYFILE"], + ssl_certfile=os.environ["SSL_CERTFILE"], ) else: uvicorn.run(app, host=args.host, port=args.port, log_level="info") From c4c195cdfe5a2b97bff534b3fb8f282820c1148a Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 19 Sep 2023 19:45:41 -0700 Subject: [PATCH 42/45] Release v0.2.29 (#2450) --- fastchat/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fastchat/__init__.py b/fastchat/__init__.py index 968391a2d..4f6b515ec 100644 --- a/fastchat/__init__.py +++ b/fastchat/__init__.py @@ -1 +1 @@ -__version__ = "0.2.28" +__version__ = "0.2.29" diff --git a/pyproject.toml b/pyproject.toml index c3ce59364..b7109f059 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "fschat" -version = "0.2.28" +version = "0.2.29" description = "An open platform for training, serving, and evaluating large language model based chatbots." readme = "README.md" requires-python = ">=3.8" From a040cdca3c01f37aa91ba2d56a8fd57f9bbbc948 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 21 Sep 2023 17:01:53 -0700 Subject: [PATCH 43/45] Show terms of use as an JS alert (#2461) --- docs/commands/webserver.md | 2 +- fastchat/llm_judge/common.py | 29 +++++ fastchat/model/model_registry.py | 8 +- fastchat/serve/gradio_block_arena_anony.py | 32 +----- fastchat/serve/gradio_block_arena_named.py | 35 ++---- fastchat/serve/gradio_web_server.py | 88 +++++++-------- fastchat/serve/gradio_web_server_multi.py | 100 ++++++------------ .../lmsys_chat_1m/sample.py | 32 ++++++ fastchat/serve/monitor/summarize_cluster.py | 18 +++- fastchat/serve/monitor/topic_clustering.py | 22 ++-- fastchat/utils.py | 14 +++ 11 files changed, 193 insertions(+), 187 deletions(-) create mode 100644 fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/sample.py diff --git a/docs/commands/webserver.md b/docs/commands/webserver.md index 920f15aa0..b6342c682 100644 --- a/docs/commands/webserver.md +++ b/docs/commands/webserver.md @@ -27,7 +27,7 @@ cd fastchat_logs/server0 export OPENAI_API_KEY= export ANTHROPIC_API_KEY= -python3 -m fastchat.serve.gradio_web_server_multi --controller http://localhost:21001 --concurrency 10 --add-chatgpt --add-claude --add-palm --anony-only --elo ~/elo_results/elo_results_20230802.pkl --leaderboard-table-file ~/elo_results/leaderboard_table_20230802.csv --register ~/elo_results/register_oai_models.json +python3 -m fastchat.serve.gradio_web_server_multi --controller http://localhost:21001 --concurrency 10 --add-chatgpt --add-claude --add-palm --anony-only --elo ~/elo_results/elo_results.pkl --leaderboard-table-file ~/elo_results/leaderboard_table.csv --register ~/elo_results/register_oai_models.json --show-terms python3 backup_logs.py ``` diff --git a/fastchat/llm_judge/common.py b/fastchat/llm_judge/common.py index ad1180034..abe1ec6cc 100644 --- a/fastchat/llm_judge/common.py +++ b/fastchat/llm_judge/common.py @@ 
-418,6 +418,35 @@ def chat_compeletion_openai(model, conv, temperature, max_tokens): return output +def chat_compeletion_openai_azure(model, conv, temperature, max_tokens): + openai.api_type = "azure" + openai.api_base = os.environ["AZURE_OPENAI_ENDPOINT"] + openai.api_key = os.environ["AZURE_OPENAI_KEY"] + openai.api_version = "2023-05-15" + + if "azure-" in model: + model = model[6:] + + output = API_ERROR_OUTPUT + for _ in range(API_MAX_RETRY): + try: + messages = conv.to_openai_api_messages() + response = openai.ChatCompletion.create( + engine=model, + messages=messages, + n=1, + temperature=temperature, + max_tokens=max_tokens, + ) + output = response["choices"][0]["message"]["content"] + break + except openai.error.OpenAIError as e: + print(type(e), e) + time.sleep(API_RETRY_SLEEP) + + return output + + def chat_compeletion_anthropic(model, conv, temperature, max_tokens): output = API_ERROR_OUTPUT for _ in range(API_MAX_RETRY): diff --git a/fastchat/model/model_registry.py b/fastchat/model/model_registry.py index 0612ca832..19a513eaa 100644 --- a/fastchat/model/model_registry.py +++ b/fastchat/model/model_registry.py @@ -19,7 +19,13 @@ def register_model_info( def get_model_info(name: str) -> ModelInfo: - return model_info[name] + if name in model_info: + return model_info[name] + else: + # To fix this, please use `register_model_info` to register your model + return ModelInfo( + name, "", "Register the description at fastchat/model/model_registry.py" + ) register_model_info( diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index 2bdb9abbb..edd89d072 100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -57,17 +57,7 @@ def load_demo_side_by_side_anony(models_, url_params): gr.Markdown.update(visible=True), ) - return ( - states - + selector_updates - + (gr.Chatbot.update(visible=True),) * num_sides - + ( - gr.Textbox.update(visible=True), - gr.Box.update(visible=True), - gr.Row.update(visible=True), - gr.Accordion.update(visible=True), - ) - ) + return states + selector_updates def vote_last_response(states, vote_type, model_selectors, request: gr.Request): @@ -388,9 +378,6 @@ def build_side_by_side_ui_anony(models): ### Leaderboard See [lmsys/chatbot-arena-leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) or the 4th tab above on this page. -### Terms of use -By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. **The service collects user dialogue data and reserves the right to distribute it under a Creative Commons Attribution (CC-BY) license.** The demo works better on desktop devices with a wide screen. - ### Battle Please scroll down and start chatting. The models include both closed-source models (e.g., ChatGPT) and open-source models (e.g., Llama). 
""" @@ -407,7 +394,7 @@ def build_side_by_side_ui_anony(models): label = "Model A" if i == 0 else "Model B" with gr.Column(): chatbots[i] = gr.Chatbot( - label=label, elem_id=f"chatbot", visible=False, height=550 + label=label, elem_id=f"chatbot", height=550 ) with gr.Row(): @@ -432,19 +419,18 @@ def build_side_by_side_ui_anony(models): textbox = gr.Textbox( show_label=False, placeholder="Enter your prompt here and press ENTER", - visible=False, container=False, elem_id="input_box", ) with gr.Column(scale=1, min_width=50): - send_btn = gr.Button(value="Send", visible=False, variant="primary") + send_btn = gr.Button(value="Send", variant="primary") with gr.Row() as button_row: clear_btn = gr.Button(value="🗑️ Clear history", interactive=False) regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) share_btn = gr.Button(value="📷 Share") - with gr.Accordion("Parameters", open=False, visible=True) as parameter_row: + with gr.Accordion("Parameters", open=False) as parameter_row: temperature = gr.Slider( minimum=0.0, maximum=1.0, @@ -560,12 +546,4 @@ def build_side_by_side_ui_anony(models): flash_buttons, [], btn_list ) - return ( - states, - model_selectors, - chatbots, - textbox, - send_btn, - button_row, - parameter_row, - ) + return states + model_selectors diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py index 3da3b223a..6c2d0b534 100644 --- a/fastchat/serve/gradio_block_arena_named.py +++ b/fastchat/serve/gradio_block_arena_named.py @@ -62,17 +62,7 @@ def load_demo_side_by_side_named(models, url_params): gr.Dropdown.update(choices=models, value=model_right, visible=True), ) - return ( - states - + selector_updates - + (gr.Chatbot.update(visible=True),) * num_sides - + ( - gr.Textbox.update(visible=True), - gr.Box.update(visible=True), - gr.Row.update(visible=True), - gr.Accordion.update(visible=True), - ) - ) + return states + selector_updates def vote_last_response(states, vote_type, model_selectors, request: gr.Request): @@ -313,10 +303,10 @@ def build_side_by_side_ui_named(models): - You can do multiple turns of conversations before voting. - Click "Clear history" to start a new round. -### Terms of use -By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. **The service collects user dialogue data and reserves the right to distribute it under a Creative Commons Attribution (CC-BY) license.** The demo works better on desktop devices with a wide screen. +### Leaderboard +See [lmsys/chatbot-arena-leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) or the 4th tab above on this page. 
-### Choose two models to chat with (view [leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard)) +### Choose two models to chat with """ states = [gr.State() for _ in range(num_sides)] @@ -345,7 +335,7 @@ def build_side_by_side_ui_named(models): label = "Model A" if i == 0 else "Model B" with gr.Column(): chatbots[i] = gr.Chatbot( - label=label, elem_id=f"chatbot", visible=False, height=550 + label=label, elem_id=f"chatbot", height=550 ) with gr.Row(): @@ -365,19 +355,18 @@ def build_side_by_side_ui_named(models): textbox = gr.Textbox( show_label=False, placeholder="Enter your prompt here and press ENTER", - visible=False, container=False, elem_id="input_box", ) with gr.Column(scale=1, min_width=50): - send_btn = gr.Button(value="Send", visible=False, variant="primary") + send_btn = gr.Button(value="Send", variant="primary") with gr.Row() as button_row: regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) clear_btn = gr.Button(value="🗑️ Clear history", interactive=False) share_btn = gr.Button(value="📷 Share") - with gr.Accordion("Parameters", open=False, visible=True) as parameter_row: + with gr.Accordion("Parameters", open=False) as parameter_row: temperature = gr.Slider( minimum=0.0, maximum=1.0, @@ -495,12 +484,4 @@ def build_side_by_side_ui_named(models): flash_buttons, [], btn_list ) - return ( - states, - model_selectors, - chatbots, - textbox, - send_btn, - button_row, - parameter_row, - ) + return states + model_selectors diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index 3c5e34675..24db98b34 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -28,7 +28,7 @@ SESSION_EXPIRATION_TIME, ) from fastchat.model.model_adapter import get_conversation_template -from fastchat.model.model_registry import model_info +from fastchat.model.model_registry import get_model_info, model_info from fastchat.serve.api_provider import ( anthropic_api_stream_iter, openai_api_stream_iter, @@ -39,6 +39,7 @@ build_logger, violates_moderation, get_window_url_params_js, + get_window_url_params_with_tos_js, parse_gradio_auth_creds, ) @@ -163,15 +164,7 @@ def load_demo_single(models, url_params): ) state = None - return ( - state, - dropdown_update, - gr.Chatbot.update(visible=True), - gr.Textbox.update(visible=True), - gr.Button.update(visible=True), - gr.Row.update(visible=True), - gr.Accordion.update(visible=True), - ) + return state, dropdown_update def load_demo(url_params, request: gr.Request): @@ -530,17 +523,11 @@ def get_model_description_md(models): ct = 0 visited = set() for i, name in enumerate(models): - if name in model_info: - minfo = model_info[name] - if minfo.simple_name in visited: - continue - visited.add(minfo.simple_name) - one_model_md = f"[{minfo.simple_name}]({minfo.link}): {minfo.description}" - else: - visited.add(name) - one_model_md = ( - f"[{name}](): Add the description at fastchat/model/model_registry.py" - ) + minfo = get_model_info(name) + if minfo.simple_name in visited: + continue + visited.add(minfo.simple_name) + one_model_md = f"[{minfo.simple_name}]({minfo.link}): {minfo.description}" if ct % 3 == 0: model_description_md += "|" @@ -566,9 +553,6 @@ def build_single_model_ui(models, add_promotion_links=False): # 🏔️ Chat with Open Large Language Models {promotion} -### Terms of use -By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. 
It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. **The service collects user dialogue data and reserves the right to distribute it under a Creative Commons Attribution (CC-BY) license.** - ### Choose a model to chat with """ @@ -588,7 +572,6 @@ def build_single_model_ui(models, add_promotion_links=False): chatbot = gr.Chatbot( elem_id="chatbot", label="Scroll down and start chatting", - visible=False, height=550, ) with gr.Row(): @@ -596,21 +579,20 @@ def build_single_model_ui(models, add_promotion_links=False): textbox = gr.Textbox( show_label=False, placeholder="Enter your prompt here and press ENTER", - visible=False, container=False, elem_id="input_box", ) with gr.Column(scale=1, min_width=50): - send_btn = gr.Button(value="Send", visible=False, variant="primary") + send_btn = gr.Button(value="Send", variant="primary") - with gr.Row(visible=False) as button_row: + with gr.Row() as button_row: upvote_btn = gr.Button(value="👍 Upvote", interactive=False) downvote_btn = gr.Button(value="👎 Downvote", interactive=False) flag_btn = gr.Button(value="⚠️ Flag", interactive=False) regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) clear_btn = gr.Button(value="🗑️ Clear history", interactive=False) - with gr.Accordion("Parameters", open=False, visible=False) as parameter_row: + with gr.Accordion("Parameters", open=False) as parameter_row: temperature = gr.Slider( minimum=0.0, maximum=1.0, @@ -673,49 +655,44 @@ def build_single_model_ui(models, add_promotion_links=False): [state, chatbot] + btn_list, ) send_btn.click( - add_text, [state, model_selector, textbox], [state, chatbot, textbox] + btn_list + add_text, + [state, model_selector, textbox], + [state, chatbot, textbox] + btn_list, ).then( bot_response, [state, temperature, top_p, max_output_tokens], [state, chatbot] + btn_list, ) - return state, model_selector, chatbot, textbox, send_btn, button_row, parameter_row + return [state, model_selector] def build_demo(models): with gr.Blocks( title="Chat with Open Large Language Models", - theme=gr.themes.Base(), + theme=gr.themes.Default(), css=block_css, ) as demo: url_params = gr.JSON(visible=False) - ( - state, - model_selector, - chatbot, - textbox, - send_btn, - button_row, - parameter_row, - ) = build_single_model_ui(models) + state, model_selector = build_single_model_ui(models) if args.model_list_mode not in ["once", "reload"]: raise ValueError(f"Unknown model list mode: {args.model_list_mode}") + + if args.show_terms_of_use: + load_js = get_window_url_params_with_tos_js + else: + load_js = get_window_url_params_js + demo.load( load_demo, [url_params], [ state, model_selector, - chatbot, - textbox, - send_btn, - button_row, - parameter_row, ], - _js=get_window_url_params_js, + _js=load_js, ) return demo @@ -728,29 +705,36 @@ def build_demo(models): parser.add_argument( "--share", action="store_true", - help="Whether to generate a public, shareable link.", + help="Whether to generate a public, shareable link", ) parser.add_argument( "--controller-url", type=str, default="http://localhost:21001", - help="The address of the controller.", + help="The address of the controller", ) parser.add_argument( "--concurrency-count", type=int, default=10, - help="The concurrency count of the gradio queue.", + help="The concurrency count of the gradio queue", ) parser.add_argument( "--model-list-mode", type=str, default="once", choices=["once", "reload"], - help="Whether to load the 
model list once or reload the model list every time.",
+        help="Whether to load the model list once or reload the model list every time",
+    )
+    parser.add_argument(
+        "--moderate",
+        action="store_true",
+        help="Enable content moderation to block unsafe inputs",
     )
     parser.add_argument(
-        "--moderate", action="store_true", help="Enable content moderation"
+        "--show-terms-of-use",
+        action="store_true",
+        help="Show terms of use before loading the demo",
     )
     parser.add_argument(
         "--add-chatgpt",
diff --git a/fastchat/serve/gradio_web_server_multi.py b/fastchat/serve/gradio_web_server_multi.py
index 97aecc66c..92618d911 100644
--- a/fastchat/serve/gradio_web_server_multi.py
+++ b/fastchat/serve/gradio_web_server_multi.py
@@ -34,6 +34,7 @@
 from fastchat.utils import (
     build_logger,
     get_window_url_params_js,
+    get_window_url_params_with_tos_js,
     parse_gradio_auth_creds,
 )
 
@@ -105,68 +106,15 @@ def build_demo(models, elo_results_file, leaderboard_table_file):
     ) as demo:
         with gr.Tabs() as tabs:
             with gr.Tab("Chatbot Arena (battle)", id=0):
-                (
-                    b_states,
-                    b_model_selectors,
-                    b_chatbots,
-                    b_textbox,
-                    b_send_btn,
-                    b_button_row,
-                    b_parameter_row,
-                ) = build_side_by_side_ui_anony(models)
-                b_list = (
-                    b_states
-                    + b_model_selectors
-                    + b_chatbots
-                    + [
-                        b_textbox,
-                        b_send_btn,
-                        b_button_row,
-                        b_parameter_row,
-                    ]
-                )
+                side_by_side_anony_list = build_side_by_side_ui_anony(models)
 
             with gr.Tab("Chatbot Arena (side-by-side)", id=1):
-                (
-                    c_states,
-                    c_model_selectors,
-                    c_chatbots,
-                    c_textbox,
-                    c_send_btn,
-                    c_button_row,
-                    c_parameter_row,
-                ) = build_side_by_side_ui_named(models)
-                c_list = (
-                    c_states
-                    + c_model_selectors
-                    + c_chatbots
-                    + [
-                        c_textbox,
-                        c_send_btn,
-                        c_button_row,
-                        c_parameter_row,
-                    ]
-                )
+                side_by_side_named_list = build_side_by_side_ui_named(models)
 
             with gr.Tab("Single Model", id=2):
-                (
-                    a_state,
-                    a_model_selector,
-                    a_chatbot,
-                    a_textbox,
-                    a_send_btn,
-                    a_button_row,
-                    a_parameter_row,
-                ) = build_single_model_ui(models, add_promotion_links=True)
-                a_list = [
-                    a_state,
-                    a_model_selector,
-                    a_chatbot,
-                    a_textbox,
-                    a_send_btn,
-                    a_button_row,
-                    a_parameter_row,
-                ]
+                single_model_list = build_single_model_ui(
+                    models, add_promotion_links=True
+                )
 
         if elo_results_file:
             with gr.Tab("Leaderboard", id=3):
@@ -176,11 +124,20 @@ def build_demo(models, elo_results_file, leaderboard_table_file):
 
     if args.model_list_mode not in ["once", "reload"]:
         raise ValueError(f"Unknown model list mode: {args.model_list_mode}")
+
+    if args.show_terms_of_use:
+        load_js = get_window_url_params_with_tos_js
+    else:
+        load_js = get_window_url_params_js
+
     demo.load(
         load_demo,
         [url_params],
-        [tabs] + a_list + b_list + c_list,
-        _js=get_window_url_params_js,
+        [tabs]
+        + single_model_list
+        + side_by_side_anony_list
+        + side_by_side_named_list,
+        _js=load_js,
     )
 
     return demo
@@ -193,19 +150,19 @@ def build_demo(models, elo_results_file, leaderboard_table_file):
     parser.add_argument(
         "--share",
         action="store_true",
-        help="Whether to generate a public, shareable link.",
+        help="Whether to generate a public, shareable link",
     )
     parser.add_argument(
         "--controller-url",
         type=str,
         default="http://localhost:21001",
-        help="The address of the controller.",
+        help="The address of the controller",
     )
     parser.add_argument(
         "--concurrency-count",
         type=int,
         default=10,
-        help="The concurrency count of the gradio queue.",
+        help="The concurrency count of the gradio queue",
     )
     parser.add_argument(
         "--model-list-mode",
         type=str,
         default="once",
         choices=["once", "reload"],
help="Whether to load the model list once or reload the model list every time.", ) parser.add_argument( - "--moderate", action="store_true", help="Enable content moderation" + "--moderate", + action="store_true", + help="Enable content moderation to block unsafe inputs", + ) + parser.add_argument( + "--show-terms-of-use", + action="store_true", + help="Shows term of use before loading the demo", ) parser.add_argument( "--add-chatgpt", @@ -248,8 +212,12 @@ def build_demo(models, elo_results_file, leaderboard_table_file): help='Set the gradio authentication file path. The file should contain one or more user:password pairs in this format: "u1:p1,u2:p2,u3:p3"', default=None, ) - parser.add_argument("--elo-results-file", type=str) - parser.add_argument("--leaderboard-table-file", type=str) + parser.add_argument( + "--elo-results-file", type=str, help="Load leaderboard results and plots" + ) + parser.add_argument( + "--leaderboard-table-file", type=str, help="Load leaderboard results and plots" + ) args = parser.parse_args() logger.info(f"args: {args}") diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/sample.py b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/sample.py new file mode 100644 index 000000000..3b6da455f --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/sample.py @@ -0,0 +1,32 @@ +""" +Count the unique users in a battle log file. + +Usage: +python3 -input in.json --number 1000 +""" + +import argparse +import json +import random + +K = 1000 + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input", type=str) + parser.add_argument("--number", type=int, nargs="+") + args = parser.parse_args() + + convs = json.load(open(args.input)) + random.seed(42) + random.shuffle(convs) + + for number in args.number: + new_convs = convs[:number] + + output = args.input.replace(".json", f"_{number//K}k.json") + with open(output, "w") as fout: + json.dump(new_convs, fout, indent=2, ensure_ascii=False) + + print(f"#in: {len(convs)}, #out: {len(new_convs)}") + print(f"Write to file: {output}") diff --git a/fastchat/serve/monitor/summarize_cluster.py b/fastchat/serve/monitor/summarize_cluster.py index aa53c265d..1d5fbcddc 100644 --- a/fastchat/serve/monitor/summarize_cluster.py +++ b/fastchat/serve/monitor/summarize_cluster.py @@ -1,12 +1,14 @@ """ Usage: -python3 summarize_cluster.py --in results_c20_kmeans_cluster.pkl --model gpt-4 +python3 summarize_cluster.py --in results_c20_kmeans_cluster.pkl --model gpt-4 --num-prompts 100 +python3 summarize_cluster.py --in results_c20_kmeans_cluster.pkl --model azure-gpt-4-32k --num-prompts 200 """ import argparse import pickle from fastchat.llm_judge.common import ( chat_compeletion_openai, + chat_compeletion_openai_azure, chat_compeletion_anthropic, ) from fastchat.conversation import get_conv_template @@ -32,18 +34,26 @@ def truncate_string(s, l): topics = [] percentages = [] for i, info in enumerate(cluster_infos): - num_samples, prompts = info + num_samples, topk_prompts, random_prompts = info percentage = num_samples / num_total_prompts print( f"cluster {i}, #prompts {num_samples}, percentage: {percentage * 100:.2f}%" ) instruct = "Given a list of user messages, use less than 8 words to summarize a central topic for all messages in English. Your output should only include a single line. Try to be specific." 
+        split = int(args.num_prompts * 0.8)
         prompt = "\n".join(
-            [truncate_string(x, l=200) for x in prompts[: args.num_prompts]]
+            [truncate_string(x, l=200) for x in topk_prompts[:split]]
+            + [
+                truncate_string(x, l=200)
+                for x in random_prompts[: args.num_prompts - split]
+            ]
         )
         prompt = "BEGIN OF THE MESSAGE LIST\n" + prompt + "\nEND OF THE MESSAGE LIST."
 
-        if "gpt" in model:
+        if "azure-" in model:
+            template_name = "chatgpt"
+            completion_func = chat_compeletion_openai_azure
+        elif "gpt" in model:
             template_name = "chatgpt"
             completion_func = chat_compeletion_openai
         elif "claude" in model:
diff --git a/fastchat/serve/monitor/topic_clustering.py b/fastchat/serve/monitor/topic_clustering.py
index 7710ce42a..dd15c6edc 100644
--- a/fastchat/serve/monitor/topic_clustering.py
+++ b/fastchat/serve/monitor/topic_clustering.py
@@ -2,7 +2,7 @@
 Usage:
 python3 topic_clustering.py --in arena.json --english-only --min-length 32
-python3 topic_clustering.py --in clean_conv_20230809_100k.json --english-only --min-length 32 --max-length 1024
+python3 topic_clustering.py --in clean_conv_20230809_100k.json --english-only --min-length 32 --max-length 1536
 """
 import argparse
 import json
@@ -90,7 +90,7 @@ def get_embeddings(texts, model_name, batch_size):
 
 def run_k_means(embeddings, num_clusters):
-    np.random.seed(0)
+    np.random.seed(42)
     clustering_model = KMeans(n_clusters=num_clusters, n_init="auto")
     clustering_model.fit(embeddings.numpy())
     centers = torch.from_numpy(clustering_model.cluster_centers_)
@@ -109,7 +109,7 @@ def run_k_means(embeddings, num_clusters):
 
 def run_agg_cluster(embeddings, num_clusters):
-    np.random.seed(0)
+    np.random.seed(42)
     clustering_model = AgglomerativeClustering(n_clusters=num_clusters)
     clustering_model.fit(embeddings)
     labels = torch.from_numpy(clustering_model.labels_)
@@ -133,7 +133,7 @@ def run_agg_cluster(embeddings, num_clusters):
 def run_hdbscan_cluster(embeddings):
     import hdbscan
 
-    np.random.seed(0)
+    np.random.seed(42)
     clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
     labels = torch.from_numpy(clusterer.fit_predict(embeddings))
 
@@ -183,13 +183,18 @@ def print_topk(texts, labels, topk_indices, show_cut_off):
 
 def get_cluster_info(texts, labels, topk_indices):
+    np.random.seed(42)
+
     cluster_info = []
     for k in range(len(topk_indices)):
         num_samples = torch.sum(labels == k).item()
-        prompts = []
+        topk_prompts = []
         for idx in topk_indices[k]:
-            prompts.append(texts[idx])
-        cluster_info.append((num_samples, prompts))
+            topk_prompts.append(texts[idx])
+        # Sample random prompts from this cluster to complement the top-k prompts
+        members = torch.nonzero(labels == k).flatten().tolist()
+        random_prompts = [texts[j] for j in np.random.choice(members, len(topk_indices[k]))]
+        cluster_info.append((num_samples, topk_prompts, random_prompts))
 
     return cluster_info
 
@@ -238,8 +243,6 @@ def get_cluster_info(texts, labels, topk_indices):
         topk_str = print_topk(texts, labels, topk_indices, args.show_cut_off)
     num_clusters = len(centers)
 
-    cluster_info = get_cluster_info(texts, labels, topk_indices)
-
     # Dump results
     filename_prefix = f"results_c{num_clusters}_{args.cluster_alg}"
     print(topk_str)
@@ -259,5 +262,6 @@ def get_cluster_info(texts, labels, topk_indices):
             obj = {"cluster": i, "text": text, "sim": score.item()}
             fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
 
+    cluster_info = get_cluster_info(texts, labels, topk_indices)
     with open(filename_prefix + "_cluster.pkl", "wb") as fout:
         pickle.dump(cluster_info, fout)
diff --git a/fastchat/utils.py b/fastchat/utils.py
index e2d3a6ac6..947d8b687 100644
--- a/fastchat/utils.py
+++ b/fastchat/utils.py
@@ -201,6 +201,20 @@ def 
pretty_print_semaphore(semaphore):
 """
 
 
+get_window_url_params_with_tos_js = """
+function() {
+    const params = new URLSearchParams(window.location.search);
+    url_params = Object.fromEntries(params);
+    console.log("url_params", url_params);
+
+    msg = "Users of this website are required to agree to the following terms:\\nThe service is a research preview. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes.\\nThe service collects user dialogue data and reserves the right to distribute it under a Creative Commons Attribution (CC-BY) license."
+    alert(msg);
+
+    return url_params;
+    }
+"""
+
+
 def iter_over_async(
     async_gen: AsyncGenerator, event_loop: AbstractEventLoop
 ) -> Generator:

From bcb8076c36f119bc62b2c495dc06ae993694d6ae Mon Sep 17 00:00:00 2001
From: dongxiaolong <774848421@qq.com>
Date: Fri, 22 Sep 2023 11:51:58 +0800
Subject: [PATCH 44/45] vllm worker awq quantization update (#2463)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: 董晓龙 <dongxiaolong@shiyanjia.com>
---
 docs/vllm_integration.md      | 5 +++++
 fastchat/serve/vllm_worker.py | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/docs/vllm_integration.md b/docs/vllm_integration.md
index 1886b1009..021fc3853 100644
--- a/docs/vllm_integration.md
+++ b/docs/vllm_integration.md
@@ -18,3 +18,8 @@ See the supported models [here](https://vllm.readthedocs.io/en/latest/models/sup
    ```
    python3 -m fastchat.serve.vllm_worker --model-path lmsys/vicuna-7b-v1.3 --tokenizer hf-internal-testing/llama-tokenizer
    ```
+
+   If you use an AWQ quantized model, try
+   ```
+   python3 -m fastchat.serve.vllm_worker --model-path TheBloke/vicuna-7B-v1.5-AWQ --quantization awq
+   ```
diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py
index 8e255b79c..2fe8e6304 100644
--- a/fastchat/serve/vllm_worker.py
+++ b/fastchat/serve/vllm_worker.py
@@ -210,6 +210,8 @@ async def api_model_details(request: Request):
     args.model = args.model_path
     if args.num_gpus > 1:
         args.tensor_parallel_size = args.num_gpus
+    if args.quantization:
+        args.quantization = args.quantization
 
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngine.from_engine_args(engine_args)

From 2855bf974f0973f85adb2bb7a9d075255b353ecf Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Thu, 21 Sep 2023 22:45:34 -0700
Subject: [PATCH 45/45] Fix falcon chat template (#2464)

---
 fastchat/conversation.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fastchat/conversation.py b/fastchat/conversation.py
index 763856f85..869bfd4bf 100644
--- a/fastchat/conversation.py
+++ b/fastchat/conversation.py
@@ -204,12 +204,12 @@ def get_prompt(self) -> str:
         elif self.sep_style == SeparatorStyle.FALCON_CHAT:
             ret = ""
             if self.system_message:
-                ret += "System: " + self.system_message + self.sep
+                ret += system_prompt + self.sep
             for role, message in self.messages:
                 if message:
                     ret += role + ": " + message + self.sep
                 else:
-                    ret += role + ": "
+                    ret += role + ":"
 
             return ret
         else:
@@ -958,6 +958,7 @@ def get_conv_template(name: str) -> Conversation:
     Conversation(
         name="falcon-chat",
         roles=("User", "Falcon"),
+        system_template="System: {system_message}",
         messages=[],
         sep_style=SeparatorStyle.FALCON_CHAT,
         sep="\n",
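A quick way to check the fixed `falcon-chat` template is to render a short conversation. The sketch below is illustrative only and is not part of the patch; it assumes a FastChat checkout at this revision, where `get_conv_template` and `Conversation.set_system_message` are available:

```
# Minimal sketch: render a prompt with the patched falcon-chat template.
from fastchat.conversation import get_conv_template

conv = get_conv_template("falcon-chat")
conv.set_system_message("You are a helpful assistant.")  # fills "System: {system_message}"
conv.append_message(conv.roles[0], "Hello!")  # roles[0] == "User"
conv.append_message(conv.roles[1], None)  # roles[1] == "Falcon"; None leaves the reply slot open
print(conv.get_prompt())
# Expected output (sep is "\n"; note there is no trailing space after "Falcon:"):
# System: You are a helpful assistant.
# User: Hello!
# Falcon:
```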
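The `chat_compeletion_openai_azure` helper added earlier in this series can be exercised in a similar way. This is a hedged sketch rather than definitive usage: the endpoint and key below are placeholders, and the text after the `azure-` prefix must match a real Azure OpenAI deployment name:

```
# Minimal sketch: drive the Azure OpenAI helper from fastchat.llm_judge.common.
import os

os.environ["AZURE_OPENAI_ENDPOINT"] = "https://example-resource.openai.azure.com"  # placeholder
os.environ["AZURE_OPENAI_KEY"] = "<your-azure-openai-key>"  # placeholder

from fastchat.conversation import get_conv_template
from fastchat.llm_judge.common import chat_compeletion_openai_azure

conv = get_conv_template("chatgpt")
conv.append_message(conv.roles[0], "Use less than 8 words to summarize: hello world.")
conv.append_message(conv.roles[1], None)

# The "azure-" prefix is stripped inside the helper, so "azure-gpt-4-32k"
# targets the deployment named "gpt-4-32k" (passed as `engine=`).
print(chat_compeletion_openai_azure("azure-gpt-4-32k", conv, temperature=0.2, max_tokens=64))
```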