From a887de74f6a4e841e676fc3966eecaf444a95508 Mon Sep 17 00:00:00 2001 From: Ning Ren Date: Fri, 22 Sep 2023 22:12:43 -0700 Subject: [PATCH] Merge 0922 (#6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Remove hardcode flash-attn disable setting (#2342) * Document turning off proxy_buffering when api is streaming (#2337) * Simplify huggingface api example (#2355) * Update sponsor logos (#2367) * if LOGDIR is empty, then don't try output log to local file (#2357) Signed-off-by: Lei Wen Co-authored-by: Lei Wen * add best_of and use_beam_search for completions interface (#2348) Signed-off-by: Lei Wen Co-authored-by: Lei Wen * Extract upvote/downvote from log files (#2369) * Revert "add best_of and use_beam_search for completions interface" (#2370) * Improve doc (#2371) * add best_of and use_beam_search for completions interface (#2372) Signed-off-by: Lei Wen Co-authored-by: Lei Wen * update monkey patch for llama2 (#2379) * Make E5 adapter more restrict to reduce mismatch (#2381) * Update UI and sponsers (#2387) * Use fsdp api for save save (#2390) * Release v0.2.27 * Spicyboros + airoboros 2.2 template update. (#2392) Co-authored-by: Jon Durbin * bugfix of openai_api_server for fastchat.serve.vllm_worker (#2398) Co-authored-by: wuyongyu * Revert "bugfix of openai_api_server for fastchat.serve.vllm_worker" (#2400) * Revert "add best_of and use_beam_search for completions interface" (#2401) * Release a v0.2.28 with bug fixes and more test cases * Fix model_worker error (#2404) * Added google/flan models and fixed AutoModelForSeq2SeqLM when loading T5 compression model (#2402) * Rename twitter to X (#2406) * Update huggingface_api.py (#2409) * Add support for baichuan2 models (#2408) * Fixed character overlap issue when api streaming output (#2431) * Support custom conversation template in multi_model_worker (#2434) * Add Ascend NPU support (#2422) * Add raw conversation template (#2417) (#2418) * Improve docs & UI (#2436) * Fix Salesforce xgen inference (#2350) * Add support for Phind-CodeLlama models (#2415) (#2416) Co-authored-by: Lianmin Zheng * Add falcon 180B chat conversation template (#2384) * Improve docs (#2438) * add dtype and seed (#2430) * Data cleaning scripts for dataset release (#2440) * merge google/flan based adapters: T5Adapter, CodeT5pAdapter, FlanAdapter (#2411) * Fix docs * Update UI (#2446) * Add Optional SSL Support to controller.py (#2448) * Format & Improve docs * Release v0.2.29 (#2450) * Show terms of use as an JS alert (#2461) * vllm worker awq quantization update (#2463) Co-authored-by: 董晓龙 * Fix falcon chat template (#2464) --------- Signed-off-by: Lei Wen Co-authored-by: Trangle Co-authored-by: Nathan Stitt Co-authored-by: Lianmin Zheng Co-authored-by: leiwen83 Co-authored-by: Lei Wen Co-authored-by: Jon Durbin Co-authored-by: Jon Durbin Co-authored-by: Rayrtfr <2384172887@qq.com> Co-authored-by: wuyongyu Co-authored-by: wangxiyuan Co-authored-by: Jeff (Zhen) Wang Co-authored-by: karshPrime <94996251+karshPrime@users.noreply.github.com> Co-authored-by: obitolyz Co-authored-by: Shangwei Chen <109785802+Somezak1@users.noreply.github.com> Co-authored-by: HyungJin Ahn Co-authored-by: zhangsibo1129 <134488188+zhangsibo1129@users.noreply.github.com> Co-authored-by: Tobias Birchler Co-authored-by: Jae-Won Chung Co-authored-by: Mingdao Liu Co-authored-by: Ying Sheng Co-authored-by: Brandon Biggs Co-authored-by: dongxiaolong <774848421@qq.com> Co-authored-by: 董晓龙 --- README.md | 5 +- docs/commands/leaderboard.md | 13 +- 
docs/commands/test_process.md | 3 + docs/commands/webserver.md | 2 +- docs/model_support.md | 4 +- docs/openai_api.md | 2 +- docs/training.md | 29 ++ docs/vllm_integration.md | 5 + fastchat/__init__.py | 2 +- fastchat/constants.py | 2 +- fastchat/conversation.py | 84 +++++- fastchat/data/merge.py | 1 - fastchat/llm_judge/README.md | 7 +- fastchat/llm_judge/common.py | 29 ++ fastchat/llm_judge/gen_model_answer.py | 42 ++- fastchat/model/compression.py | 18 +- fastchat/model/model_adapter.py | 112 +++++--- fastchat/model/model_codet5p.py | 2 + fastchat/model/model_falcon.py | 2 + fastchat/model/model_registry.py | 41 ++- fastchat/serve/cli.py | 3 + fastchat/serve/controller.py | 20 +- fastchat/serve/gradio_block_arena_anony.py | 89 +++--- fastchat/serve/gradio_block_arena_named.py | 71 +++-- fastchat/serve/gradio_web_server.py | 119 ++++---- fastchat/serve/gradio_web_server_multi.py | 106 +++---- fastchat/serve/huggingface_api.py | 10 +- fastchat/serve/inference.py | 15 +- fastchat/serve/launch_all_serve.py | 2 +- fastchat/serve/model_worker.py | 26 +- fastchat/serve/monitor/basic_stats.py | 2 +- fastchat/serve/monitor/clean_battle_data.py | 19 +- fastchat/serve/monitor/clean_chat_data.py | 31 ++- .../arena_33k}/count_unique_users.py | 0 .../arena_33k}/filter_bad_conv.py | 0 .../arena_33k}/merge_field.py | 0 .../arena_33k}/sample.py | 0 .../arena_33k}/upload_hf_dataset.py | 0 .../lmsys_chat_1m/compute_stats.py | 119 ++++++++ .../lmsys_chat_1m/filter_bad_conv.py | 148 ++++++++++ .../lmsys_chat_1m/final_post_processing.py | 27 ++ .../lmsys_chat_1m/instructions.md | 23 ++ .../lmsys_chat_1m/merge_oai_tag.py | 45 +++ .../lmsys_chat_1m/process_all.sh | 18 ++ .../lmsys_chat_1m/sample.py | 32 +++ .../lmsys_chat_1m/upload_hf_dataset.py | 17 ++ .../serve/monitor/hf_space_leaderboard_app.py | 258 ------------------ fastchat/serve/monitor/intersect_conv_file.py | 25 ++ fastchat/serve/monitor/monitor.py | 23 +- fastchat/serve/monitor/replace_model_name.py | 21 -- fastchat/serve/monitor/summarize_cluster.py | 19 +- fastchat/serve/monitor/topic_clustering.py | 52 +++- fastchat/serve/multi_model_worker.py | 17 +- fastchat/serve/openai_api_server.py | 12 +- fastchat/serve/vllm_worker.py | 2 + fastchat/train/train.py | 20 +- fastchat/train/train_mem.py | 2 +- fastchat/utils.py | 55 +++- pyproject.toml | 4 +- tests/launch_openai_api_test_server.py | 22 +- tests/test_cli.py | 5 +- tests/test_openai_api.py | 15 +- 62 files changed, 1226 insertions(+), 673 deletions(-) rename fastchat/serve/monitor/{conv_release_scripts => dataset_release_scripts/arena_33k}/count_unique_users.py (100%) rename fastchat/serve/monitor/{conv_release_scripts => dataset_release_scripts/arena_33k}/filter_bad_conv.py (100%) rename fastchat/serve/monitor/{conv_release_scripts => dataset_release_scripts/arena_33k}/merge_field.py (100%) rename fastchat/serve/monitor/{conv_release_scripts => dataset_release_scripts/arena_33k}/sample.py (100%) rename fastchat/serve/monitor/{conv_release_scripts => dataset_release_scripts/arena_33k}/upload_hf_dataset.py (100%) create mode 100644 fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/compute_stats.py create mode 100644 fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/filter_bad_conv.py create mode 100644 fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/final_post_processing.py create mode 100644 fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/instructions.md create mode 100644 
fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/merge_oai_tag.py create mode 100644 fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/process_all.sh create mode 100644 fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/sample.py create mode 100644 fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/upload_hf_dataset.py delete mode 100644 fastchat/serve/monitor/hf_space_leaderboard_app.py create mode 100644 fastchat/serve/monitor/intersect_conv_file.py delete mode 100644 fastchat/serve/monitor/replace_model_name.py diff --git a/README.md b/README.md index 77658f01b..8e611922e 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,10 @@ We are focused to support Llama2 at scale now. If you want any other models, ple ## Dev Log +### 2023-09 + +Sync upstream changes + ### 2023-08 Support llama2 at scale. @@ -37,4 +41,3 @@ Support "Llama-2-13b-chat-hf" and make it the default for API. * API key database and rate limit enforcement * Deployable on Kubernetes - diff --git a/docs/commands/leaderboard.md b/docs/commands/leaderboard.md index d06aa1a05..0a668f649 100644 --- a/docs/commands/leaderboard.md +++ b/docs/commands/leaderboard.md @@ -11,5 +11,16 @@ python3 clean_battle_data.py ### Run Elo analysis ``` -python3 elo_analysis.py --clean-battle-file clean_battle_20230523.json +python3 elo_analysis.py --clean-battle-file clean_battle_20230905.json +``` + +### Copy files to HF space +1. update plots +``` +scp atlas:/data/lmzheng/FastChat/fastchat/serve/monitor/elo_results_20230905.pkl . +``` + +2. update table +``` +wget https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard/raw/main/leaderboard_table_20230905.csv ``` diff --git a/docs/commands/test_process.md b/docs/commands/test_process.md index 642ffaa02..804717556 100644 --- a/docs/commands/test_process.md +++ b/docs/commands/test_process.md @@ -1,3 +1,6 @@ +## Unit tests for FastChat +The scripts are under [FastChat/tests](../../tests). 
+ ### Test CLI Inference ``` diff --git a/docs/commands/webserver.md b/docs/commands/webserver.md index 920f15aa0..b6342c682 100644 --- a/docs/commands/webserver.md +++ b/docs/commands/webserver.md @@ -27,7 +27,7 @@ cd fastchat_logs/server0 export OPENAI_API_KEY= export ANTHROPIC_API_KEY= -python3 -m fastchat.serve.gradio_web_server_multi --controller http://localhost:21001 --concurrency 10 --add-chatgpt --add-claude --add-palm --anony-only --elo ~/elo_results/elo_results_20230802.pkl --leaderboard-table-file ~/elo_results/leaderboard_table_20230802.csv --register ~/elo_results/register_oai_models.json +python3 -m fastchat.serve.gradio_web_server_multi --controller http://localhost:21001 --concurrency 10 --add-chatgpt --add-claude --add-palm --anony-only --elo ~/elo_results/elo_results.pkl --leaderboard-table-file ~/elo_results/leaderboard_table.csv --register ~/elo_results/register_oai_models.json --show-terms python3 backup_logs.py ``` diff --git a/docs/model_support.md b/docs/model_support.md index 8c1a58eea..a9eb4c895 100644 --- a/docs/model_support.md +++ b/docs/model_support.md @@ -31,6 +31,7 @@ - [openaccess-ai-collective/manticore-13b-chat-pyg](https://huggingface.co/openaccess-ai-collective/manticore-13b-chat-pyg) - [OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5](https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5) - [VMware/open-llama-7b-v2-open-instruct](https://huggingface.co/VMware/open-llama-7b-v2-open-instruct) +- [Phind/Phind-CodeLlama-34B-v2](https://huggingface.co/Phind/Phind-CodeLlama-34B-v2) - [project-baize/baize-v2-7b](https://huggingface.co/project-baize/baize-v2-7b) - [Qwen/Qwen-7B-Chat](https://huggingface.co/Qwen/Qwen-7B-Chat) - [Salesforce/codet5p-6b](https://huggingface.co/Salesforce/codet5p-6b) @@ -38,6 +39,7 @@ - [THUDM/chatglm-6b](https://huggingface.co/THUDM/chatglm-6b) - [THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b) - [tiiuae/falcon-40b](https://huggingface.co/tiiuae/falcon-40b) +- [tiiuae/falcon-180B-chat](https://huggingface.co/tiiuae/falcon-180B-chat) - [timdettmers/guanaco-33b-merged](https://huggingface.co/timdettmers/guanaco-33b-merged) - [togethercomputer/RedPajama-INCITE-7B-Chat](https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Chat) - [WizardLM/WizardLM-13B-V1.0](https://huggingface.co/WizardLM/WizardLM-13B-V1.0) @@ -71,7 +73,7 @@ You can add `--debug` to see the actual prompt sent to the model. FastChat uses the `Conversation` class to handle prompt templates and `BaseModelAdapter` class to handle model loading. -1. Implement a conversation template for the new model at [fastchat/conversation.py](https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py). You can follow existing examples and use `register_conv_template` to add a new one. +1. Implement a conversation template for the new model at [fastchat/conversation.py](https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py). You can follow existing examples and use `register_conv_template` to add a new one. Please also add a link to the official reference code if possible. 2. Implement a model adapter for the new model at [fastchat/model/model_adapter.py](https://github.com/lm-sys/FastChat/blob/main/fastchat/model/model_adapter.py). You can follow existing examples and use `register_model_adapter` to add a new one. 3. 
(Optional) add the model name to the "Supported models" [section](#supported-models) above and add more information in [fastchat/model/model_registry.py](https://github.com/lm-sys/FastChat/blob/main/fastchat/model/model_registry.py). diff --git a/docs/openai_api.md b/docs/openai_api.md index f69cc4f00..0c555a60e 100644 --- a/docs/openai_api.md +++ b/docs/openai_api.md @@ -62,7 +62,7 @@ completion = openai.ChatCompletion.create( print(completion.choices[0].message.content) ``` -Streaming is also supported. See [test_openai_api.py](../tests/test_openai_api.py). +Streaming is also supported. See [test_openai_api.py](../tests/test_openai_api.py). If your api server is behind a proxy you'll need to turn off buffering, you can do so in Nginx by setting `proxy_buffering off;` in the location block for the proxy. ### cURL cURL is another good tool for observing the output of the api. diff --git a/docs/training.md b/docs/training.md index 05cbf894d..077221824 100644 --- a/docs/training.md +++ b/docs/training.md @@ -87,3 +87,32 @@ deepspeed fastchat/train/train_lora_t5.py \ --deepspeed playground/deepspeed_config_s2.json ``` + +### Fine-tuning Vicuna-7B with Local NPUs + +You can use the following command to train Vicuna-7B with 8 x 910B (60GB). Use `--nproc_per_node` to specify the number of NPUs. +```bash +torchrun --nproc_per_node=8 --master_port=20001 fastchat/train/train.py \ + --model_name_or_path ~/vicuna-7b-v1.5-16k \ + --data_path data/dummy_conversation.json \ + --fp16 True \ + --output_dir output_vicuna \ + --num_train_epochs 3 \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 1200 \ + --save_total_limit 10 \ + --learning_rate 2e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --fsdp "full_shard auto_wrap" \ + --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --lazy_preprocess True +``` diff --git a/docs/vllm_integration.md b/docs/vllm_integration.md index 1886b1009..021fc3853 100644 --- a/docs/vllm_integration.md +++ b/docs/vllm_integration.md @@ -18,3 +18,8 @@ See the supported models [here](https://vllm.readthedocs.io/en/latest/models/sup ``` python3 -m fastchat.serve.vllm_worker --model-path lmsys/vicuna-7b-v1.3 --tokenizer hf-internal-testing/llama-tokenizer ``` + + if you use a awq model, try + ''' + python3 -m fastchat.serve.vllm_worker --model-path TheBloke/vicuna-7B-v1.5-AWQ --quantization awq + ''' diff --git a/fastchat/__init__.py b/fastchat/__init__.py index 3b9e925d1..4f6b515ec 100644 --- a/fastchat/__init__.py +++ b/fastchat/__init__.py @@ -1 +1 @@ -__version__ = "0.2.26" +__version__ = "0.2.29" diff --git a/fastchat/constants.py b/fastchat/constants.py index 0eb7af371..c26c5f489 100644 --- a/fastchat/constants.py +++ b/fastchat/constants.py @@ -15,7 +15,7 @@ CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION." INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE." 
# Maximum input length -INPUT_CHAR_LEN_LIMIT = int(os.getenv("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 2560)) +INPUT_CHAR_LEN_LIMIT = int(os.getenv("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 3072)) # Maximum conversation turns CONVERSATION_TURN_LIMIT = 50 # Session expiration time diff --git a/fastchat/conversation.py b/fastchat/conversation.py index f733be68a..869bfd4bf 100644 --- a/fastchat/conversation.py +++ b/fastchat/conversation.py @@ -27,6 +27,7 @@ class SeparatorStyle(IntEnum): RWKV = auto() PHOENIX = auto() ROBIN = auto() + FALCON_CHAT = auto() @dataclasses.dataclass @@ -200,6 +201,17 @@ def get_prompt(self) -> str: else: ret += role + ":\n" return ret + elif self.sep_style == SeparatorStyle.FALCON_CHAT: + ret = "" + if self.system_message: + ret += system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ":" + + return ret else: raise ValueError(f"Invalid style: {self.sep_style}") @@ -285,6 +297,17 @@ def get_conv_template(name: str) -> Conversation: return conv_templates[name].copy() +# An empty template for raw conversation. +register_conv_template( + Conversation( + name="raw", + system_message="", + roles=("", ""), + sep_style=SeparatorStyle.NO_COLON_SINGLE, + sep="", + ) +) + # A template with a one-shot conversation example register_conv_template( Conversation( @@ -357,6 +380,17 @@ def get_conv_template(name: str) -> Conversation: ) ) +register_conv_template( + Conversation( + name="airoboros_v2", + system_message="A chat.", + roles=("USER", "ASSISTANT"), + sep_style=SeparatorStyle.ADD_COLON_TWO, + sep="\n", + sep2="", + ) +) + # Koala default template register_conv_template( Conversation( @@ -743,11 +777,10 @@ def get_conv_template(name: str) -> Conversation: Conversation( name="xgen", system_message="A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n", - roles=("### Human: ", "###"), - sep_style=SeparatorStyle.NO_COLON_SINGLE, + roles=("### Human", "### Assistant"), + sep_style=SeparatorStyle.ADD_COLON_SINGLE, sep="\n", - stop_token_ids=[50256, 0, 1, 2], - stop_str="<|endoftext|>", + stop_token_ids=[50256], ) ) @@ -793,6 +826,20 @@ def get_conv_template(name: str) -> Conversation: ) ) +# Baichuan2-13B-Chat template +register_conv_template( + # source: https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/c6f8592a60b4ad73c210b28dd2ab3cca51abbf93/modeling_baichuan.py#L773 + # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_config.json + # https://github.com/baichuan-inc/Baichuan2/issues/62 + Conversation( + name="baichuan2-chat", + roles=("", ""), + sep_style=SeparatorStyle.NO_COLON_SINGLE, + sep="", + stop_token_ids=[], + ) +) + # llama2 template # reference: https://huggingface.co/blog/codellama#conversational-instructions # reference: https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/generation.py#L212 @@ -905,6 +952,35 @@ def get_conv_template(name: str) -> Conversation: ) ) +# Falcon 180B chat template +# source: https://huggingface.co/spaces/tiiuae/falcon-180b-demo/blob/d1590ee7fae9b6ce331ba7808e61a29dcce9239f/app.py#L28-L37 +register_conv_template( + Conversation( + name="falcon-chat", + roles=("User", "Falcon"), + system_template="System: {system_message}", + messages=[], + sep_style=SeparatorStyle.FALCON_CHAT, + sep="\n", + sep2="<|endoftext|>", + stop_str="\nUser:", # use stop_str to stop generation after stop_token_ids, it will also remove stop_str from the generated text + ) +) + +# Phind template +# source: https://huggingface.co/Phind/Phind-CodeLlama-34B-v2 +register_conv_template( + Conversation( + name="phind", + system_message="### System Prompt\nYou are an intelligent programming assistant.", + roles=("### User Message", "### Assistant"), + messages=(), + offset=0, + sep_style=SeparatorStyle.ADD_COLON_SINGLE, + sep="\n\n", + ) +) + if __name__ == "__main__": print("Vicuna template:") diff --git a/fastchat/data/merge.py b/fastchat/data/merge.py index 044401315..0ae63ea76 100644 --- a/fastchat/data/merge.py +++ b/fastchat/data/merge.py @@ -6,7 +6,6 @@ import argparse import json -from typing import Dict, Sequence, Optional if __name__ == "__main__": diff --git a/fastchat/llm_judge/README.md b/fastchat/llm_judge/README.md index 9338b1b86..f1755e3e5 100644 --- a/fastchat/llm_judge/README.md +++ b/fastchat/llm_judge/README.md @@ -1,5 +1,5 @@ # LLM Judge -| [Paper](https://arxiv.org/abs/2306.05685) | [Leaderboard](https://chat.lmsys.org/?leaderboard) | +| [Paper](https://arxiv.org/abs/2306.05685) | [Leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) | In this package, you can use MT-bench questions and prompts to evaluate your models with LLM-as-a-judge. MT-bench is a set of challenging multi-turn open-ended questions for evaluating chat assistants. 
@@ -10,7 +10,7 @@ To automate the evaluation process, we prompt strong LLMs like GPT-4 to act as j - [Review Pre-Generated Model Answers and Judgments](#review-pre-generated-model-answers-and-judgments) - [MT-Bench](#mt-bench) - [Agreement Computation](#agreement-computation) -- [Dataset](#dataset) +- [Datasets](#datasets) - [Citation](#citation) ## Install @@ -64,6 +64,7 @@ This mode asks GPT-4 to grade and give a score to model's answer directly withou For each turn, GPT-4 will give a score on a scale of 10. We then compute the average score on all turns. ``` +export OPENAI_API_KEY=XXXXXX # set the OpenAI API key python gen_judgment.py --model-list [LIST-OF-MODEL-ID] --parallel [num-concurrent-api-call] ``` @@ -133,7 +134,7 @@ We released 3.3K human annotations for model responses generated by 6 models in This Colab [notebook](https://colab.research.google.com/drive/1ctgygDRJhVGUJTQy8-bRZCl1WNcT8De6?usp=sharing) shows how to compute the agreement between humans and GPT-4 judge with the dataset. Our results show that humans and GPT-4 judge achieve over 80\% agreement, the same level of agreement between humans. -## Dataset +## Datasets - [Chatbot Arena Conversation Dataset](https://huggingface.co/datasets/lmsys/chatbot_arena_conversations) - [MT-bench Human Annotation Dataset](https://huggingface.co/datasets/lmsys/mt_bench_human_judgments) diff --git a/fastchat/llm_judge/common.py b/fastchat/llm_judge/common.py index ad1180034..abe1ec6cc 100644 --- a/fastchat/llm_judge/common.py +++ b/fastchat/llm_judge/common.py @@ -418,6 +418,35 @@ def chat_compeletion_openai(model, conv, temperature, max_tokens): return output +def chat_compeletion_openai_azure(model, conv, temperature, max_tokens): + openai.api_type = "azure" + openai.api_base = os.environ["AZURE_OPENAI_ENDPOINT"] + openai.api_key = os.environ["AZURE_OPENAI_KEY"] + openai.api_version = "2023-05-15" + + if "azure-" in model: + model = model[6:] + + output = API_ERROR_OUTPUT + for _ in range(API_MAX_RETRY): + try: + messages = conv.to_openai_api_messages() + response = openai.ChatCompletion.create( + engine=model, + messages=messages, + n=1, + temperature=temperature, + max_tokens=max_tokens, + ) + output = response["choices"][0]["message"]["content"] + break + except openai.error.OpenAIError as e: + print(type(e), e) + time.sleep(API_RETRY_SLEEP) + + return output + + def chat_compeletion_anthropic(model, conv, temperature, max_tokens): output = API_ERROR_OUTPUT for _ in range(API_MAX_RETRY): diff --git a/fastchat/llm_judge/gen_model_answer.py b/fastchat/llm_judge/gen_model_answer.py index 3d093ecd5..c36665b8f 100644 --- a/fastchat/llm_judge/gen_model_answer.py +++ b/fastchat/llm_judge/gen_model_answer.py @@ -15,6 +15,7 @@ from fastchat.llm_judge.common import load_questions, temperature_config from fastchat.model import load_model, get_conversation_template +from fastchat.utils import str_to_torch_dtype def run_eval( @@ -29,6 +30,7 @@ def run_eval( num_gpus_per_model, num_gpus_total, max_gpu_memory, + dtype, ): questions = load_questions(question_file, question_begin, question_end) # random shuffle the questions to balance the loading @@ -45,7 +47,7 @@ def run_eval( else: get_answers_func = get_model_answers - chunk_size = len(questions) // (num_gpus_total // num_gpus_per_model) // 2 + chunk_size = len(questions) // (num_gpus_total // num_gpus_per_model) ans_handles = [] for i in range(0, len(questions), chunk_size): ans_handles.append( @@ -58,6 +60,7 @@ def run_eval( num_choices, num_gpus_per_model, max_gpu_memory, + dtype=dtype, ) ) 
@@ -75,12 +78,14 @@ def get_model_answers( num_choices, num_gpus_per_model, max_gpu_memory, + dtype, ): model, tokenizer = load_model( model_path, device="cuda", num_gpus=num_gpus_per_model, max_gpu_memory=max_gpu_memory, + dtype=dtype, load_8bit=False, cpu_offloading=False, debug=False, @@ -192,7 +197,9 @@ def reorg_answer_file(answer_file): required=True, help="The path to the weights. This can be a local folder or a Hugging Face repo ID.", ) - parser.add_argument("--model-id", type=str, required=True) + parser.add_argument( + "--model-id", type=str, required=True, help="A custom name for the model." + ) parser.add_argument( "--bench-name", type=str, @@ -234,6 +241,14 @@ def reorg_answer_file(answer_file): type=str, help="Maxmum GPU memory used for model weights per GPU.", ) + parser.add_argument( + "--dtype", + type=str, + choices=["float32", "float16", "bfloat16"], + help="Override the default dtype. If not set, it will use float16 on GPU and float32 on CPU.", + default=None, + ) + args = parser.parse_args() if args.num_gpus_total // args.num_gpus_per_model > 1: @@ -250,17 +265,18 @@ def reorg_answer_file(answer_file): print(f"Output to {answer_file}") run_eval( - args.model_path, - args.model_id, - question_file, - args.question_begin, - args.question_end, - answer_file, - args.max_new_token, - args.num_choices, - args.num_gpus_per_model, - args.num_gpus_total, - args.max_gpu_memory, + model_path=args.model_path, + model_id=args.model_id, + question_file=question_file, + question_begin=args.question_begin, + question_end=args.question_end, + answer_file=answer_file, + max_new_token=args.max_new_token, + num_choices=args.num_choices, + num_gpus_per_model=args.num_gpus_per_model, + num_gpus_total=args.num_gpus_total, + max_gpu_memory=args.max_gpu_memory, + dtype=str_to_torch_dtype(args.dtype), ) reorg_answer_file(answer_file) diff --git a/fastchat/model/compression.py b/fastchat/model/compression.py index 4a1d2adb7..e80d9aaba 100644 --- a/fastchat/model/compression.py +++ b/fastchat/model/compression.py @@ -11,7 +11,13 @@ from torch.nn import functional as F import torch.nn as nn from tqdm import tqdm -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + AutoModel, + AutoModelForSeq2SeqLM, +) @dataclasses.dataclass @@ -123,7 +129,13 @@ def load_compress_model(model_path, device, torch_dtype, use_fast, revision="mai # some models are loaded by AutoModel but not AutoModelForCausalLM, # such as chatglm, chatglm2 try: - model = AutoModelForCausalLM.from_config(config, trust_remote_code=True) + # google/flan-* models are based on an AutoModelForSeq2SeqLM. 
+ if "T5Config" in str(type(config)): + model = AutoModelForSeq2SeqLM.from_config( + config, trust_remote_code=True + ) + else: + model = AutoModelForCausalLM.from_config(config, trust_remote_code=True) except NameError: model = AutoModel.from_config(config, trust_remote_code=True) linear_weights = get_compressed_list(model) @@ -181,6 +193,8 @@ def load_compress_model(model_path, device, torch_dtype, use_fast, revision="mai torch.cuda.empty_cache() if device == "xpu": torch.xpu.empty_cache() + if device == "npu": + torch.npu.empty_cache() for name in model.state_dict(): if name not in linear_weights: diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index d74d9d76a..2bcccb817 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -2,6 +2,7 @@ import math import os +import re import sys from typing import Dict, List, Optional import warnings @@ -30,9 +31,7 @@ from fastchat.modules.awq import AWQConfig, load_awq_quantized from fastchat.conversation import Conversation, get_conv_template from fastchat.model.compression import load_compress_model -from fastchat.model.llama_condense_monkey_patch import ( - replace_llama_with_condense, -) +from fastchat.model.llama_condense_monkey_patch import replace_llama_with_condense from fastchat.model.model_chatglm import generate_stream_chatglm from fastchat.model.model_codet5p import generate_stream_codet5p from fastchat.model.model_falcon import generate_stream_falcon @@ -151,6 +150,7 @@ def load_model( device: str = "cuda", num_gpus: int = 1, max_gpu_memory: Optional[str] = None, + dtype: Optional[torch.dtype] = None, load_8bit: bool = False, cpu_offloading: bool = False, gptq_config: Optional[GptqConfig] = None, @@ -206,6 +206,13 @@ def load_model( warnings.warn( "Intel Extension for PyTorch is not installed, but is required for xpu inference." ) + elif device == "npu": + kwargs = {"torch_dtype": torch.float16} + # Try to load ipex, while it looks unused, it links into torch for xpu support + try: + import torch_npu + except ImportError: + warnings.warn("Ascend Extension for PyTorch is not installed.") else: raise ValueError(f"Invalid device: {device}") @@ -275,6 +282,9 @@ def load_model( return model, tokenizer kwargs["revision"] = revision + if dtype is not None: # Overwrite dtype if it is provided in the arguments. + kwargs["torch_dtype"] = dtype + # Load model model, tokenizer = adapter.load_model(model_path, kwargs) @@ -288,6 +298,7 @@ def load_model( if (device == "cuda" and num_gpus == 1 and not cpu_offloading) or device in ( "mps", "xpu", + "npu", ): model.to(device) @@ -369,7 +380,7 @@ def add_model_args(parser): parser.add_argument( "--device", type=str, - choices=["cpu", "cuda", "mps", "xpu"], + choices=["cpu", "cuda", "mps", "xpu", "npu"], default="cuda", help="The device type", ) @@ -385,6 +396,13 @@ def add_model_args(parser): type=str, help="The maximum memory per GPU for storing model weights. Use a string like '13Gib'", ) + parser.add_argument( + "--dtype", + type=str, + choices=["float32", "float16", "bfloat16"], + help="Override the default dtype. 
If not set, it will use float16 on GPU and float32 on CPU.", + default=None, + ) parser.add_argument( "--load-8bit", action="store_true", help="Use 8-bit quantization" ) @@ -582,9 +600,13 @@ class AiroborosAdapter(BaseModelAdapter): """The model adapter for jondurbin/airoboros-*""" def match(self, model_path: str): - return "airoboros" in model_path.lower() + if re.search(r"airoboros|spicyboros", model_path, re.I): + return True + return False def get_default_conv_template(self, model_path: str) -> Conversation: + if "spicyboros" in model_path or re.search(r"-(2\.[2-9]+)", model_path): + return get_conv_template("airoboros_v2") return get_conv_template("airoboros_v1") def load_model(self, model_path: str, from_pretrained_kwargs: dict): @@ -632,15 +654,18 @@ def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("vicuna_v1.1") -class CodeT5pAdapter(BaseModelAdapter): - """The model adapter for Salesforce/codet5p-6b""" +class GoogleT5Adapter(BaseModelAdapter): + """The model adapter for google/Flan based models, such as Salesforce/codet5p-6b, lmsys/fastchat-t5-3b-v1.0, flan-t5-*, flan-ul2""" def match(self, model_path: str): - return "codet5p" in model_path.lower() + return any( + model_str in model_path.lower() + for model_str in ["flan-", "fastchat-t5", "codet5p"] + ) def load_model(self, model_path: str, from_pretrained_kwargs: dict): revision = from_pretrained_kwargs.get("revision", "main") - tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision) + tokenizer = T5Tokenizer.from_pretrained(model_path, revision=revision) model = AutoModelForSeq2SeqLM.from_pretrained( model_path, low_cpu_mem_usage=True, @@ -650,23 +675,8 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): return model, tokenizer -class T5Adapter(BaseModelAdapter): - """The model adapter for lmsys/fastchat-t5-3b-v1.0""" - - def match(self, model_path: str): - return "t5" in model_path.lower() - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - tokenizer = T5Tokenizer.from_pretrained(model_path, revision=revision) - model = AutoModelForSeq2SeqLM.from_pretrained( - model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs - ) - return model, tokenizer - - class KoalaAdapter(BaseModelAdapter): - """The model adapter for koala""" + """The model adapter for Koala""" use_fast_tokenizer = False @@ -678,7 +688,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class AlpacaAdapter(BaseModelAdapter): - """The model adapter for alpaca""" + """The model adapter for Alpaca""" use_fast_tokenizer = False @@ -1113,7 +1123,7 @@ class FalconAdapter(BaseModelAdapter): """The model adapter for tiiuae/falcon-40b""" def match(self, model_path: str): - return "falcon" in model_path.lower() + return "falcon" in model_path.lower() and "chat" not in model_path.lower() def load_model(self, model_path: str, from_pretrained_kwargs: dict): revision = from_pretrained_kwargs.get("revision", "main") @@ -1134,6 +1144,14 @@ def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("falcon") +class FalconChatAdapter(BaseModelAdapter): + def match(self, model_path: str): + return "falcon" in model_path.lower() and "chat" in model_path.lower() + + def get_default_conv_template(self, model_path: str) -> Conversation: + return get_conv_template("falcon-chat") + + class TigerBotAdapter(BaseModelAdapter): """The model adapter for 
TigerResearch/tigerbot-7b-sft""" @@ -1181,6 +1199,8 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): def get_default_conv_template(self, model_path: str) -> Conversation: # for Baichuan-13B-Chat if "chat" in model_path.lower(): + if "baichuan2" in model_path.lower(): + return get_conv_template("baichuan2-chat") return get_conv_template("baichuan-chat") return get_conv_template("zero_shot") @@ -1258,7 +1278,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class Llama2Adapter(BaseModelAdapter): - """The model adapter for llama-2""" + """The model adapter for Llama-2 (e.g., meta-llama/Llama-2-7b-hf)""" def match(self, model_path: str): return "llama-2" in model_path.lower() @@ -1274,7 +1294,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class CuteGPTAdapter(BaseModelAdapter): - """The model adapter for llama-2""" + """The model adapter for CuteGPT""" def match(self, model_path: str): return "cutegpt" in model_path.lower() @@ -1318,7 +1338,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class WizardCoderAdapter(BaseModelAdapter): - """The model adapter for WizardCoder""" + """The model adapter for WizardCoder (e.g., WizardLM/WizardCoder-Python-34B-V1.0)""" use_fast_tokenizer = False @@ -1360,7 +1380,8 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict): model_path, trust_remote_code=True, ) - config.use_flash_attn = False + # NOTE: if you use the old version of model file, please remove the comments below + # config.use_flash_attn = False config.fp16 = True generation_config = GenerationConfig.from_pretrained( model_path, trust_remote_code=True @@ -1391,7 +1412,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class BGEAdapter(BaseModelAdapter): - """The model adapter for BGE""" + """The model adapter for BGE (e.g., BAAI/bge-large-en-v1.5)""" use_fast_tokenizer = False @@ -1420,12 +1441,12 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class E5Adapter(BaseModelAdapter): - """The model adapter for E5""" + """The model adapter for E5 (e.g., intfloat/e5-large-v2)""" use_fast_tokenizer = False def match(self, model_path: str): - return "e5" in model_path.lower() + return "e5-" in model_path.lower() def load_model(self, model_path: str, from_pretrained_kwargs: dict): revision = from_pretrained_kwargs.get("revision", "main") @@ -1498,7 +1519,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class VigogneInstructAdapter(BaseModelAdapter): - """The model adapter for Vigogne-Instruct""" + """The model adapter for Vigogne-Instruct (e.g., bofenghuang/vigogne-2-7b-instruct)""" use_fast_tokenizer = False @@ -1526,7 +1547,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class VigogneChatAdapter(BaseModelAdapter): - """The model adapter for Vigogne-Chat""" + """The model adapter for Vigogne-Chat (e.g., bofenghuang/vigogne-7b-chat)""" use_fast_tokenizer = False @@ -1554,7 +1575,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class OpenLLaMaOpenInstructAdapter(BaseModelAdapter): - """The model adapter for OpenLLaMa-Open-Instruct""" + """The model adapter for OpenLLaMa-Open-Instruct (e.g., VMware/open-llama-7b-open-instruct)""" use_fast_tokenizer = False @@ -1584,7 +1605,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: class CodeLlamaAdapter(BaseModelAdapter): - """The model adapter for Code Llama""" + """The model adapter for CodeLlama 
(e.g., codellama/CodeLlama-34b-hf)""" def match(self, model_path: str): return "codellama" in model_path.lower() @@ -1599,14 +1620,23 @@ def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("llama-2") +class PhindCodeLlamaAdapter(CodeLlamaAdapter): + """The model adapter for Phind-CodeLlama (e.g., Phind/Phind-CodeLlama-34B-v2)""" + + def match(self, model_path: str): + return "phind-codellama-" in model_path.lower() + + def get_default_conv_template(self, model_path: str) -> Conversation: + return get_conv_template("phind") + + # Note: the registration order matters. # The one registered earlier has a higher matching priority. register_model_adapter(PeftModelAdapter) register_model_adapter(VicunaAdapter) register_model_adapter(AiroborosAdapter) register_model_adapter(LongChatAdapter) -register_model_adapter(CodeT5pAdapter) -register_model_adapter(T5Adapter) +register_model_adapter(GoogleT5Adapter) register_model_adapter(KoalaAdapter) register_model_adapter(AlpacaAdapter) register_model_adapter(ChatGLMAdapter) @@ -1634,6 +1664,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(CamelAdapter) register_model_adapter(ChangGPTAdapter) register_model_adapter(TuluAdapter) +register_model_adapter(FalconChatAdapter) register_model_adapter(FalconAdapter) register_model_adapter(TigerBotAdapter) register_model_adapter(BaichuanAdapter) @@ -1655,6 +1686,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(VigogneChatAdapter) register_model_adapter(OpenLLaMaOpenInstructAdapter) register_model_adapter(ReaLMAdapter) +register_model_adapter(PhindCodeLlamaAdapter) register_model_adapter(CodeLlamaAdapter) # After all adapters, try the default base adapter. 
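The new `--dtype` flag introduced above is converted to a `torch.dtype` by `str_to_torch_dtype` from `fastchat.utils`; that helper's hunk is not shown in this excerpt. A minimal sketch consistent with the CLI choices (`float32`, `float16`, `bfloat16`, default `None`) could look like the following — an illustration, not necessarily the exact upstream implementation:

```python
from typing import Optional

import torch


def str_to_torch_dtype(dtype: Optional[str]) -> Optional[torch.dtype]:
    """Map the --dtype CLI choice to a torch dtype.

    Returning None keeps load_model's default behavior
    (float16 on GPU, float32 on CPU).
    """
    if dtype is None:
        return None
    mapping = {
        "float32": torch.float32,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
    }
    if dtype not in mapping:
        raise ValueError(f"Unrecognized dtype: {dtype}")
    return mapping[dtype]
```

The result is passed straight through to `load_model(..., dtype=...)`, as in the `cli.py` and `gen_model_answer.py` hunks above.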
diff --git a/fastchat/model/model_codet5p.py b/fastchat/model/model_codet5p.py index 63481bc5e..0984513c9 100644 --- a/fastchat/model/model_codet5p.py +++ b/fastchat/model/model_codet5p.py @@ -104,3 +104,5 @@ def __call__( torch.cuda.empty_cache() if device == "xpu": torch.xpu.empty_cache() + if device == "npu": + torch.npu.empty_cache() diff --git a/fastchat/model/model_falcon.py b/fastchat/model/model_falcon.py index 20afc4f0f..dc8af8efa 100644 --- a/fastchat/model/model_falcon.py +++ b/fastchat/model/model_falcon.py @@ -136,3 +136,5 @@ def generate_stream_falcon( torch.cuda.empty_cache() if device == "xpu": torch.xpu.empty_cache() + if device == "npu": + torch.npu.empty_cache() diff --git a/fastchat/model/model_registry.py b/fastchat/model/model_registry.py index 96e2e768d..19a513eaa 100644 --- a/fastchat/model/model_registry.py +++ b/fastchat/model/model_registry.py @@ -19,7 +19,13 @@ def register_model_info( def get_model_info(name: str) -> ModelInfo: - return model_info[name] + if name in model_info: + return model_info[name] + else: + # To fix this, please use `register_model_info` to register your model + return ModelInfo( + name, "", "Register the description at fastchat/model/model_registry.py" + ) register_model_info( @@ -81,7 +87,7 @@ def get_model_info(name: str) -> ModelInfo: "a chat assistant fine-tuned from LLaMA on user-shared conversations by LMSYS", ) register_model_info( - ["wizardlm-13b"], + ["wizardlm-70b", "wizardlm-30b", "wizardlm-13b"], "WizardLM", "https://github.com/nlpxucan/WizardLM", "an instruction-following LLM using evol-instruct by Microsoft", @@ -214,15 +220,25 @@ def get_model_info(name: str) -> ModelInfo: ) register_model_info( [ - "airoboros-7b-gpt4-1.4", - "airoboros-13b-gpt4-1.4", - "airoboros-33b-gpt4-1.4", - "airoboros-65b-gpt4-1.4", + "airoboros-l2-7b-2.1", + "airoboros-l2-13b-2.1", + "airoboros-c34b-2.1", + "airoboros-l2-70b-2.1", ], "airoboros", - "https://huggingface.co/jondurbin/airoboros-33b-gpt4-1.4", + "https://huggingface.co/jondurbin/airoboros-l2-70b-2.1", "an instruction-tuned LlaMa model tuned with 100% synthetic instruction-response pairs from GPT4", ) +register_model_info( + [ + "spicyboros-7b-2.2", + "spicyboros-13b-2.2", + "spicyboros-70b-2.2", + ], + "spicyboros", + "https://huggingface.co/jondurbin/spicyboros-70b-2.2", + "de-aligned versions of the airoboros models", +) register_model_info( ["Robin-7b-v2", "Robin-13b-v2", "Robin-33b-v2"], "Robin-v2", @@ -242,9 +258,16 @@ def get_model_info(name: str) -> ModelInfo: "A chatbot fine-tuned from RedPajama-INCITE-7B-Base by Together", ) register_model_info( - ["falcon-7b", "falcon-7b-instruct", "falcon-40b", "falcon-40b-instruct"], + [ + "falcon-7b", + "falcon-7b-instruct", + "falcon-40b", + "falcon-40b-instruct", + "falcon-180b", + "falcon-180b-chat", + ], "Falcon", - "https://huggingface.co/tiiuae/falcon-40b", + "https://huggingface.co/tiiuae/falcon-180B", "TII's flagship series of large language models", ) register_model_info( diff --git a/fastchat/serve/cli.py b/fastchat/serve/cli.py index 8b3ce84cc..09e079e5f 100644 --- a/fastchat/serve/cli.py +++ b/fastchat/serve/cli.py @@ -26,11 +26,13 @@ from rich.console import Console from rich.live import Live from rich.markdown import Markdown +import torch from fastchat.model.model_adapter import add_model_args from fastchat.modules.gptq import GptqConfig from fastchat.modules.awq import AWQConfig from fastchat.serve.inference import ChatIO, chat_loop +from fastchat.utils import str_to_torch_dtype class SimpleChatIO(ChatIO): @@ -208,6 +210,7 
@@ def main(args): args.device, args.num_gpus, args.max_gpu_memory, + str_to_torch_dtype(args.dtype), args.load_8bit, args.cpu_offloading, args.conv_template, diff --git a/fastchat/serve/controller.py b/fastchat/serve/controller.py index 04f119f72..a67da62c4 100644 --- a/fastchat/serve/controller.py +++ b/fastchat/serve/controller.py @@ -8,6 +8,7 @@ from enum import Enum, auto import json import logging +import os import time from typing import List, Union import threading @@ -318,6 +319,13 @@ def create_controller(): choices=["lottery", "shortest_queue"], default="shortest_queue", ) + parser.add_argument( + "--ssl", + action="store_true", + required=False, + default=False, + help="Enable SSL. Requires OS Environment variables 'SSL_KEYFILE' and 'SSL_CERTFILE'.", + ) args = parser.parse_args() logger.info(f"args: {args}") @@ -327,4 +335,14 @@ def create_controller(): if __name__ == "__main__": args, controller = create_controller() - uvicorn.run(app, host=args.host, port=args.port, log_level="info") + if args.ssl: + uvicorn.run( + app, + host=args.host, + port=args.port, + log_level="info", + ssl_keyfile=os.environ["SSL_KEYFILE"], + ssl_certfile=os.environ["SSL_CERTFILE"], + ) + else: + uvicorn.run(app, host=args.host, port=args.port, log_level="info") diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index e20bdcd78..edd89d072 100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -25,6 +25,7 @@ no_change_btn, enable_btn, disable_btn, + invisible_btn, acknowledgment_md, ip_expiration_dict, ) @@ -56,18 +57,7 @@ def load_demo_side_by_side_anony(models_, url_params): gr.Markdown.update(visible=True), ) - return ( - states - + selector_updates - + (gr.Chatbot.update(visible=True),) * num_sides - + ( - gr.Textbox.update(visible=True), - gr.Box.update(visible=True), - gr.Row.update(visible=True), - gr.Row.update(visible=True), - gr.Accordion.update(visible=True), - ) - ) + return states + selector_updates def vote_last_response(states, vote_type, model_selectors, request: gr.Request): @@ -148,7 +138,12 @@ def regenerate(state0, state1, request: gr.Request): def clear_history(request: gr.Request): logger.info(f"clear_history (anony). 
ip: {request.client.host}") return ( - [None] * num_sides + [None] * num_sides + anony_names + [""] + [disable_btn] * 6 + [None] * num_sides + + [None] * num_sides + + anony_names + + [""] + + [invisible_btn] * 4 + + [disable_btn] * 2 ) @@ -174,17 +169,17 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re "vicuna-33b": 1.5, "vicuna-13b": 1.5, "mpt-30b-chat": 1.5, + "wizardlm-70b": 1.5, "wizardlm-13b": 1.5, # tier 2 "codellama-13b-instruct": 1.0, - "guanaco-33b": 1.0, "vicuna-7b": 1.0, "llama-2-7b-chat": 1.0, - # tier 3 + "chatglm2-6b": 1.0, + # deprecated + "guanaco-33b": 1.0, "fastchat-t5-3b": 0.5, "alpaca-13b": 0.5, - "chatglm2-6b": 0.5, - # deprecated "mpt-7b-chat": 0.1, "oasst-pythia-12b": 0.1, "RWKV-4-Raven-14B": 0.1, @@ -196,7 +191,7 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re "chatglm-6b": 0.5, } -SAMPLING_BOOST_MODELS = ["llama-2-70b-chat"] +SAMPLING_BOOST_MODELS = [] model_pairs = [] model_pairs_weights = [] @@ -372,21 +367,19 @@ def bot_response_multi( def build_side_by_side_ui_anony(models): notice_markdown = """ # ⚔️ Chatbot Arena ⚔️ : Benchmarking LLMs in the Wild +| [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://huggingface.co/datasets/lmsys/chatbot_arena_conversations) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | + ### Rules - Chat with two anonymous models side-by-side and vote for which one is better! - You can do multiple turns of conversations before voting. - The names of the models will be revealed after your vote. Conversations with identity keywords (e.g., ChatGPT, Bard, Vicuna) or any votes after the names are revealed will not count towards the leaderboard. - Click "Clear history" to start a new round. -- | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | ### Leaderboard See [lmsys/chatbot-arena-leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) or the 4th tab above on this page. -### Terms of use -By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. **The service collects user dialogue data and reserves the right to distribute it under a Creative Commons Attribution (CC-BY) license.** The demo works better on desktop devices with a wide screen. - ### Battle -Please scroll down and start chatting. The models include both closed-source models (e.g., ChatGPT) and open-source models (e.g., Llama, Vicuna). +Please scroll down and start chatting. The models include both closed-source models (e.g., ChatGPT) and open-source models (e.g., Llama). 
""" states = [gr.State() for _ in range(num_sides)] @@ -398,41 +391,46 @@ def build_side_by_side_ui_anony(models): with gr.Box(elem_id="share-region-anony"): with gr.Row(): for i in range(num_sides): + label = "Model A" if i == 0 else "Model B" with gr.Column(): - model_selectors[i] = gr.Markdown(anony_names[i]) + chatbots[i] = gr.Chatbot( + label=label, elem_id=f"chatbot", height=550 + ) with gr.Row(): for i in range(num_sides): - label = "Model A" if i == 0 else "Model B" with gr.Column(): - chatbots[i] = gr.Chatbot( - label=label, elem_id=f"chatbot", visible=False, height=550 - ) + model_selectors[i] = gr.Markdown(anony_names[i]) - with gr.Box() as button_row: - with gr.Row(): - leftvote_btn = gr.Button(value="👈 A is better", interactive=False) - rightvote_btn = gr.Button(value="👉 B is better", interactive=False) - tie_btn = gr.Button(value="🤝 Tie", interactive=False) - bothbad_btn = gr.Button(value="👎 Both are bad", interactive=False) + with gr.Row(): + leftvote_btn = gr.Button( + value="👈 A is better", visible=False, interactive=False + ) + rightvote_btn = gr.Button( + value="👉 B is better", visible=False, interactive=False + ) + tie_btn = gr.Button(value="🤝 Tie", visible=False, interactive=False) + bothbad_btn = gr.Button( + value="👎 Both are bad", visible=False, interactive=False + ) with gr.Row(): with gr.Column(scale=20): textbox = gr.Textbox( show_label=False, - placeholder="Enter text and press ENTER", - visible=False, + placeholder="Enter your prompt here and press ENTER", container=False, + elem_id="input_box", ) with gr.Column(scale=1, min_width=50): - send_btn = gr.Button(value="Send", visible=False) + send_btn = gr.Button(value="Send", variant="primary") - with gr.Row() as button_row2: - regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) + with gr.Row() as button_row: clear_btn = gr.Button(value="🗑️ Clear history", interactive=False) + regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) share_btn = gr.Button(value="📷 Share") - with gr.Accordion("Parameters", open=False, visible=True) as parameter_row: + with gr.Accordion("Parameters", open=False) as parameter_row: temperature = gr.Slider( minimum=0.0, maximum=1.0, @@ -548,13 +546,4 @@ def build_side_by_side_ui_anony(models): flash_buttons, [], btn_list ) - return ( - states, - model_selectors, - chatbots, - textbox, - send_btn, - button_row, - button_row2, - parameter_row, - ) + return states + model_selectors diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py index b26172f3e..6c2d0b534 100644 --- a/fastchat/serve/gradio_block_arena_named.py +++ b/fastchat/serve/gradio_block_arena_named.py @@ -24,6 +24,7 @@ no_change_btn, enable_btn, disable_btn, + invisible_btn, acknowledgment_md, get_model_description_md, ip_expiration_dict, @@ -61,18 +62,7 @@ def load_demo_side_by_side_named(models, url_params): gr.Dropdown.update(choices=models, value=model_right, visible=True), ) - return ( - states - + selector_updates - + (gr.Chatbot.update(visible=True),) * num_sides - + ( - gr.Textbox.update(visible=True), - gr.Box.update(visible=True), - gr.Row.update(visible=True), - gr.Row.update(visible=True), - gr.Accordion.update(visible=True), - ) - ) + return states + selector_updates def vote_last_response(states, vote_type, model_selectors, request: gr.Request): @@ -137,7 +127,13 @@ def regenerate(state0, state1, request: gr.Request): def clear_history(request: gr.Request): logger.info(f"clear_history (named). 
ip: {request.client.host}") - return [None] * num_sides + [None] * num_sides + [""] + [disable_btn] * 6 + return ( + [None] * num_sides + + [None] * num_sides + + [""] + + [invisible_btn] * 4 + + [disable_btn] * 2 + ) def share_click(state0, state1, model_selector0, model_selector1, request: gr.Request): @@ -299,17 +295,18 @@ def flash_buttons(): def build_side_by_side_ui_named(models): notice_markdown = """ # ⚔️ Chatbot Arena ⚔️ : Benchmarking LLMs in the Wild +| [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://huggingface.co/datasets/lmsys/chatbot_arena_conversations) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | + ### Rules - Chat with two models side-by-side and vote for which one is better! - You pick the models you want to chat with. - You can do multiple turns of conversations before voting. - Click "Clear history" to start a new round. -- | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | -### Terms of use -By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. **The service collects user dialogue data and reserves the right to distribute it under a Creative Commons Attribution (CC-BY) license.** The demo works better on desktop devices with a wide screen. +### Leaderboard +See [lmsys/chatbot-arena-leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) or the 4th tab above on this page. 
-### Choose two models to chat with (view [leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard)) +### Choose two models to chat with """ states = [gr.State() for _ in range(num_sides)] @@ -338,33 +335,38 @@ def build_side_by_side_ui_named(models): label = "Model A" if i == 0 else "Model B" with gr.Column(): chatbots[i] = gr.Chatbot( - label=label, elem_id=f"chatbot", visible=False, height=550 + label=label, elem_id=f"chatbot", height=550 ) - with gr.Box() as button_row: - with gr.Row(): - leftvote_btn = gr.Button(value="👈 A is better", interactive=False) - rightvote_btn = gr.Button(value="👉 B is better", interactive=False) - tie_btn = gr.Button(value="🤝 Tie", interactive=False) - bothbad_btn = gr.Button(value="👎 Both are bad", interactive=False) + with gr.Row(): + leftvote_btn = gr.Button( + value="👈 A is better", visible=False, interactive=False + ) + rightvote_btn = gr.Button( + value="👉 B is better", visible=False, interactive=False + ) + tie_btn = gr.Button(value="🤝 Tie", visible=False, interactive=False) + bothbad_btn = gr.Button( + value="👎 Both are bad", visible=False, interactive=False + ) with gr.Row(): with gr.Column(scale=20): textbox = gr.Textbox( show_label=False, - placeholder="Enter text and press ENTER", - visible=False, + placeholder="Enter your prompt here and press ENTER", container=False, + elem_id="input_box", ) with gr.Column(scale=1, min_width=50): - send_btn = gr.Button(value="Send", visible=False) + send_btn = gr.Button(value="Send", variant="primary") - with gr.Row() as button_row2: + with gr.Row() as button_row: regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) clear_btn = gr.Button(value="🗑️ Clear history", interactive=False) share_btn = gr.Button(value="📷 Share") - with gr.Accordion("Parameters", open=False, visible=True) as parameter_row: + with gr.Accordion("Parameters", open=False) as parameter_row: temperature = gr.Slider( minimum=0.0, maximum=1.0, @@ -482,13 +484,4 @@ def build_side_by_side_ui_named(models): flash_buttons, [], btn_list ) - return ( - states, - model_selectors, - chatbots, - textbox, - send_btn, - button_row, - button_row2, - parameter_row, - ) + return states + model_selectors diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index 29134dff4..24db98b34 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -28,7 +28,7 @@ SESSION_EXPIRATION_TIME, ) from fastchat.model.model_adapter import get_conversation_template -from fastchat.model.model_registry import model_info +from fastchat.model.model_registry import get_model_info, model_info from fastchat.serve.api_provider import ( anthropic_api_stream_iter, openai_api_stream_iter, @@ -39,6 +39,7 @@ build_logger, violates_moderation, get_window_url_params_js, + get_window_url_params_with_tos_js, parse_gradio_auth_creds, ) @@ -48,14 +49,22 @@ headers = {"User-Agent": "FastChat Client"} no_change_btn = gr.Button.update() -enable_btn = gr.Button.update(interactive=True) +enable_btn = gr.Button.update(interactive=True, visible=True) disable_btn = gr.Button.update(interactive=False) +invisible_btn = gr.Button.update(interactive=False, visible=False) controller_url = None enable_moderation = False acknowledgment_md = """ -**Acknowledgment:** We thank Kaggle, MBZUAI, and AnyScale for their sponsorship. +### Acknowledgment +
+We thank Kaggle, MBZUAI, AnyScale, and HuggingFace for their sponsorship.
+[four sponsor logo images]
""" ip_expiration_dict = defaultdict(lambda: 0) @@ -155,15 +164,7 @@ def load_demo_single(models, url_params): ) state = None - return ( - state, - dropdown_update, - gr.Chatbot.update(visible=True), - gr.Textbox.update(visible=True), - gr.Button.update(visible=True), - gr.Row.update(visible=True), - gr.Accordion.update(visible=True), - ) + return state, dropdown_update def load_demo(url_params, request: gr.Request): @@ -494,9 +495,23 @@ def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request) #leaderboard_dataframe td { line-height: 0.1em; } +#input_box textarea { +} footer { display:none !important } +.image-container { + display: flex; + align-items: center; + padding: 1px; +} +.image-container img { + margin: 0 30px; + height: 20px; + max-height: 100%; + width: auto; + max-width: 20%; +} """ @@ -508,17 +523,11 @@ def get_model_description_md(models): ct = 0 visited = set() for i, name in enumerate(models): - if name in model_info: - minfo = model_info[name] - if minfo.simple_name in visited: - continue - visited.add(minfo.simple_name) - one_model_md = f"[{minfo.simple_name}]({minfo.link}): {minfo.description}" - else: - visited.add(name) - one_model_md = ( - f"[{name}](): Add the description at fastchat/model/model_registry.py" - ) + minfo = get_model_info(name) + if minfo.simple_name in visited: + continue + visited.add(minfo.simple_name) + one_model_md = f"[{minfo.simple_name}]({minfo.link}): {minfo.description}" if ct % 3 == 0: model_description_md += "|" @@ -532,9 +541,9 @@ def get_model_description_md(models): def build_single_model_ui(models, add_promotion_links=False): promotion = ( """ +- | [GitHub](https://github.com/lm-sys/FastChat) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | - Introducing Llama 2: The Next Generation Open Source Large Language Model. [[Website]](https://ai.meta.com/llama/) - Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90% ChatGPT Quality. [[Blog]](https://lmsys.org/blog/2023-03-30-vicuna/) -- | [GitHub](https://github.com/lm-sys/FastChat) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | """ if add_promotion_links else "" @@ -544,9 +553,6 @@ def build_single_model_ui(models, add_promotion_links=False): # 🏔️ Chat with Open Large Language Models {promotion} -### Terms of use -By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. 
**The service collects user dialogue data and reserves the right to distribute it under a Creative Commons Attribution (CC-BY) license.** - ### Choose a model to chat with """ @@ -566,28 +572,27 @@ def build_single_model_ui(models, add_promotion_links=False): chatbot = gr.Chatbot( elem_id="chatbot", label="Scroll down and start chatting", - visible=False, height=550, ) with gr.Row(): with gr.Column(scale=20): textbox = gr.Textbox( show_label=False, - placeholder="Enter text and press ENTER", - visible=False, + placeholder="Enter your prompt here and press ENTER", container=False, + elem_id="input_box", ) with gr.Column(scale=1, min_width=50): - send_btn = gr.Button(value="Send", visible=False) + send_btn = gr.Button(value="Send", variant="primary") - with gr.Row(visible=False) as button_row: + with gr.Row() as button_row: upvote_btn = gr.Button(value="👍 Upvote", interactive=False) downvote_btn = gr.Button(value="👎 Downvote", interactive=False) flag_btn = gr.Button(value="⚠️ Flag", interactive=False) regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) clear_btn = gr.Button(value="🗑️ Clear history", interactive=False) - with gr.Accordion("Parameters", open=False, visible=False) as parameter_row: + with gr.Accordion("Parameters", open=False) as parameter_row: temperature = gr.Slider( minimum=0.0, maximum=1.0, @@ -650,49 +655,44 @@ def build_single_model_ui(models, add_promotion_links=False): [state, chatbot] + btn_list, ) send_btn.click( - add_text, [state, model_selector, textbox], [state, chatbot, textbox] + btn_list + add_text, + [state, model_selector, textbox], + [state, chatbot, textbox] + btn_list, ).then( bot_response, [state, temperature, top_p, max_output_tokens], [state, chatbot] + btn_list, ) - return state, model_selector, chatbot, textbox, send_btn, button_row, parameter_row + return [state, model_selector] def build_demo(models): with gr.Blocks( title="Chat with Open Large Language Models", - theme=gr.themes.Base(), + theme=gr.themes.Default(), css=block_css, ) as demo: url_params = gr.JSON(visible=False) - ( - state, - model_selector, - chatbot, - textbox, - send_btn, - button_row, - parameter_row, - ) = build_single_model_ui(models) + state, model_selector = build_single_model_ui(models) if args.model_list_mode not in ["once", "reload"]: raise ValueError(f"Unknown model list mode: {args.model_list_mode}") + + if args.show_terms_of_use: + load_js = get_window_url_params_with_tos_js + else: + load_js = get_window_url_params_js + demo.load( load_demo, [url_params], [ state, model_selector, - chatbot, - textbox, - send_btn, - button_row, - parameter_row, ], - _js=get_window_url_params_js, + _js=load_js, ) return demo @@ -705,29 +705,36 @@ def build_demo(models): parser.add_argument( "--share", action="store_true", - help="Whether to generate a public, shareable link.", + help="Whether to generate a public, shareable link", ) parser.add_argument( "--controller-url", type=str, default="http://localhost:21001", - help="The address of the controller.", + help="The address of the controller", ) parser.add_argument( "--concurrency-count", type=int, default=10, - help="The concurrency count of the gradio queue.", + help="The concurrency count of the gradio queue", ) parser.add_argument( "--model-list-mode", type=str, default="once", choices=["once", "reload"], - help="Whether to load the model list once or reload the model list every time.", + help="Whether to load the model list once or reload the model list every time", ) parser.add_argument( - "--moderate", 
action="store_true", help="Enable content moderation" + "--moderate", + action="store_true", + help="Enable content moderation to block unsafe inputs", + ) + parser.add_argument( + "--show-terms-of-use", + action="store_true", + help="Shows term of use before loading the demo", ) parser.add_argument( "--add-chatgpt", diff --git a/fastchat/serve/gradio_web_server_multi.py b/fastchat/serve/gradio_web_server_multi.py index 09e227f90..92618d911 100644 --- a/fastchat/serve/gradio_web_server_multi.py +++ b/fastchat/serve/gradio_web_server_multi.py @@ -34,6 +34,7 @@ from fastchat.utils import ( build_logger, get_window_url_params_js, + get_window_url_params_with_tos_js, parse_gradio_auth_creds, ) @@ -100,77 +101,20 @@ def load_demo(url_params, request: gr.Request): def build_demo(models, elo_results_file, leaderboard_table_file): with gr.Blocks( title="Chat with Open Large Language Models", - theme=gr.themes.Base(), + theme=gr.themes.Default(), css=block_css, ) as demo: with gr.Tabs() as tabs: with gr.Tab("Chatbot Arena (battle)", id=0): - ( - b_states, - b_model_selectors, - b_chatbots, - b_textbox, - b_send_btn, - b_button_row, - b_button_row2, - b_parameter_row, - ) = build_side_by_side_ui_anony(models) - b_list = ( - b_states - + b_model_selectors - + b_chatbots - + [ - b_textbox, - b_send_btn, - b_button_row, - b_button_row2, - b_parameter_row, - ] - ) + side_by_side_anony_list = build_side_by_side_ui_anony(models) with gr.Tab("Chatbot Arena (side-by-side)", id=1): - ( - c_states, - c_model_selectors, - c_chatbots, - c_textbox, - c_send_btn, - c_button_row, - c_button_row2, - c_parameter_row, - ) = build_side_by_side_ui_named(models) - c_list = ( - c_states - + c_model_selectors - + c_chatbots - + [ - c_textbox, - c_send_btn, - c_button_row, - c_button_row2, - c_parameter_row, - ] - ) + side_by_side_named_list = build_side_by_side_ui_named(models) with gr.Tab("Single Model", id=2): - ( - a_state, - a_model_selector, - a_chatbot, - a_textbox, - a_send_btn, - a_button_row, - a_parameter_row, - ) = build_single_model_ui(models, add_promotion_links=True) - a_list = [ - a_state, - a_model_selector, - a_chatbot, - a_textbox, - a_send_btn, - a_button_row, - a_parameter_row, - ] + single_model_list = build_single_model_ui( + models, add_promotion_links=True + ) if elo_results_file: with gr.Tab("Leaderboard", id=3): @@ -180,11 +124,20 @@ def build_demo(models, elo_results_file, leaderboard_table_file): if args.model_list_mode not in ["once", "reload"]: raise ValueError(f"Unknown model list mode: {args.model_list_mode}") + + if args.show_terms_of_use: + load_js = get_window_url_params_with_tos_js + else: + load_js = get_window_url_params_js + demo.load( load_demo, [url_params], - [tabs] + a_list + b_list + c_list, - _js=get_window_url_params_js, + [tabs] + + single_model_list + + side_by_side_anony_list + + side_by_side_named_list, + _js=load_js, ) return demo @@ -197,19 +150,19 @@ def build_demo(models, elo_results_file, leaderboard_table_file): parser.add_argument( "--share", action="store_true", - help="Whether to generate a public, shareable link.", + help="Whether to generate a public, shareable link", ) parser.add_argument( "--controller-url", type=str, default="http://localhost:21001", - help="The address of the controller.", + help="The address of the controller", ) parser.add_argument( "--concurrency-count", type=int, default=10, - help="The concurrency count of the gradio queue.", + help="The concurrency count of the gradio queue", ) parser.add_argument( "--model-list-mode", @@ -219,7 
+172,14 @@ def build_demo(models, elo_results_file, leaderboard_table_file): help="Whether to load the model list once or reload the model list every time.", ) parser.add_argument( - "--moderate", action="store_true", help="Enable content moderation" + "--moderate", + action="store_true", + help="Enable content moderation to block unsafe inputs", + ) + parser.add_argument( + "--show-terms-of-use", + action="store_true", + help="Shows term of use before loading the demo", ) parser.add_argument( "--add-chatgpt", @@ -252,8 +212,12 @@ def build_demo(models, elo_results_file, leaderboard_table_file): help='Set the gradio authentication file path. The file should contain one or more user:password pairs in this format: "u1:p1,u2:p2,u3:p3"', default=None, ) - parser.add_argument("--elo-results-file", type=str) - parser.add_argument("--leaderboard-table-file", type=str) + parser.add_argument( + "--elo-results-file", type=str, help="Load leaderboard results and plots" + ) + parser.add_argument( + "--leaderboard-table-file", type=str, help="Load leaderboard results and plots" + ) args = parser.parse_args() logger.info(f"args: {args}") diff --git a/fastchat/serve/huggingface_api.py b/fastchat/serve/huggingface_api.py index 47dcb87b1..5a4c30fec 100644 --- a/fastchat/serve/huggingface_api.py +++ b/fastchat/serve/huggingface_api.py @@ -6,16 +6,15 @@ python3 -m fastchat.serve.huggingface_api --model lmsys/fastchat-t5-3b-v1.0 """ import argparse -import json import torch -from transformers import AutoTokenizer, AutoModelForCausalLM from fastchat.model import load_model, get_conversation_template, add_model_args @torch.inference_mode() def main(args): + # Load model model, tokenizer = load_model( args.model_path, device=args.device, @@ -27,15 +26,15 @@ def main(args): debug=args.debug, ) + # Build the prompt with a conversation template msg = args.message - conv = get_conversation_template(args.model_path) conv.append_message(conv.roles[0], msg) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() - inputs = tokenizer([prompt]) - inputs = {k: torch.tensor(v).to(args.device) for k, v in inputs.items()} + # Run inference + inputs = tokenizer([prompt], return_tensors="pt").to(args.device) output_ids = model.generate( **inputs, do_sample=True if args.temperature > 1e-5 else False, @@ -52,6 +51,7 @@ def main(args): output_ids, skip_special_tokens=True, spaces_between_special_tokens=False ) + # Print results print(f"{conv.roles[0]}: {msg}") print(f"{conv.roles[1]}: {outputs}") diff --git a/fastchat/serve/inference.py b/fastchat/serve/inference.py index 750ff11ab..fd2810389 100644 --- a/fastchat/serve/inference.py +++ b/fastchat/serve/inference.py @@ -84,7 +84,8 @@ def generate_stream( echo = bool(params.get("echo", True)) stop_str = params.get("stop", None) stop_token_ids = params.get("stop_token_ids", None) or [] - stop_token_ids.append(tokenizer.eos_token_id) + if tokenizer.eos_token_id not in stop_token_ids: + stop_token_ids.append(tokenizer.eos_token_id) logits_processor = prepare_logits_processor( temperature, repetition_penalty, top_p, top_k @@ -112,6 +113,7 @@ def generate_stream( past_key_values = out = None sent_interrupt = False + finish_reason = None for i in range(max_new_tokens): if i == 0: # prefill if model.config.is_encoder_decoder: @@ -244,12 +246,11 @@ def generate_stream( break # Finish stream event, which contains finish reason - if i == max_new_tokens - 1: + else: finish_reason = "length" - elif stopped: + + if stopped: finish_reason = "stop" - else: - finish_reason = None 
yield { "text": output, @@ -267,6 +268,8 @@ def generate_stream( torch.cuda.empty_cache() if device == "xpu": torch.xpu.empty_cache() + if device == "npu": + torch.npu.empty_cache() class ChatIO(abc.ABC): @@ -292,6 +295,7 @@ def chat_loop( device: str, num_gpus: int, max_gpu_memory: str, + dtype: Optional[torch.dtype], load_8bit: bool, cpu_offloading: bool, conv_template: Optional[str], @@ -313,6 +317,7 @@ def chat_loop( device=device, num_gpus=num_gpus, max_gpu_memory=max_gpu_memory, + dtype=dtype, load_8bit=load_8bit, cpu_offloading=cpu_offloading, gptq_config=gptq_config, diff --git a/fastchat/serve/launch_all_serve.py b/fastchat/serve/launch_all_serve.py index 7847f0064..1952cfb17 100644 --- a/fastchat/serve/launch_all_serve.py +++ b/fastchat/serve/launch_all_serve.py @@ -66,7 +66,7 @@ parser.add_argument( "--device", type=str, - choices=["cpu", "cuda", "mps", "xpu"], + choices=["cpu", "cuda", "mps", "xpu", "npu"], default="cuda", help="The device type", ) diff --git a/fastchat/serve/model_worker.py b/fastchat/serve/model_worker.py index dac3764d4..54d51cfd0 100644 --- a/fastchat/serve/model_worker.py +++ b/fastchat/serve/model_worker.py @@ -34,6 +34,7 @@ ) import torch import torch.nn.functional as F +from transformers import set_seed import uvicorn from fastchat.constants import WORKER_HEART_BEAT_INTERVAL, ErrorCode, SERVER_ERROR_MSG @@ -46,7 +47,12 @@ ) from fastchat.modules.gptq import GptqConfig from fastchat.modules.awq import AWQConfig -from fastchat.utils import build_logger, pretty_print_semaphore, get_context_length +from fastchat.utils import ( + build_logger, + pretty_print_semaphore, + get_context_length, + str_to_torch_dtype, +) worker_id = str(uuid.uuid4())[:8] @@ -190,13 +196,15 @@ def __init__( device: str, num_gpus: int, max_gpu_memory: str, + dtype: Optional[torch.dtype] = None, load_8bit: bool = False, cpu_offloading: bool = False, gptq_config: Optional[GptqConfig] = None, awq_config: Optional[AWQConfig] = None, stream_interval: int = 2, - conv_template: str = None, + conv_template: Optional[str] = None, embed_in_truncate: bool = False, + seed: Optional[int] = None, **kwargs, ): super().__init__( @@ -215,6 +223,7 @@ def __init__( device=device, num_gpus=num_gpus, max_gpu_memory=max_gpu_memory, + dtype=dtype, load_8bit=load_8bit, cpu_offloading=cpu_offloading, gptq_config=gptq_config, @@ -227,6 +236,7 @@ def __init__( self.generate_stream_func = get_generate_stream_function(self.model, model_path) self.stream_interval = stream_interval self.embed_in_truncate = embed_in_truncate + self.seed = seed if not no_register: self.init_heart_beat() @@ -235,6 +245,8 @@ def generate_stream_gate(self, params): self.call_ct += 1 try: + if self.seed is not None: + set_seed(self.seed) for output in self.generate_stream_func( self.model, self.tokenizer, @@ -370,6 +382,8 @@ def get_embeddings(self, params): torch.cuda.empty_cache() if self.device == "xpu": torch.xpu.empty_cache() + if self.device == "npu": + torch.npu.empty_cache() except torch.cuda.OutOfMemoryError as e: ret = { "text": f"{SERVER_ERROR_MSG}\n\n({e})", @@ -473,6 +487,12 @@ def create_model_worker(): ) parser.add_argument("--stream-interval", type=int, default=2) parser.add_argument("--no-register", action="store_true") + parser.add_argument( + "--seed", + type=int, + default=None, + help="Overwrite the random seed for each generation.", + ) args = parser.parse_args() logger.info(f"args: {args}") @@ -506,6 +526,7 @@ def create_model_worker(): device=args.device, num_gpus=args.num_gpus, 
max_gpu_memory=args.max_gpu_memory, + dtype=str_to_torch_dtype(args.dtype), load_8bit=args.load_8bit, cpu_offloading=args.cpu_offloading, gptq_config=gptq_config, @@ -513,6 +534,7 @@ def create_model_worker(): stream_interval=args.stream_interval, conv_template=args.conv_template, embed_in_truncate=args.embed_in_truncate, + seed=args.seed, ) return args, worker diff --git a/fastchat/serve/monitor/basic_stats.py b/fastchat/serve/monitor/basic_stats.py index b57e0913c..e1934bb07 100644 --- a/fastchat/serve/monitor/basic_stats.py +++ b/fastchat/serve/monitor/basic_stats.py @@ -17,7 +17,7 @@ def get_log_files(max_num_files=None): dates = [] - for month in range(4, 9): + for month in range(4, 12): for day in range(1, 33): dates.append(f"2023-{month:02d}-{day:02d}") diff --git a/fastchat/serve/monitor/clean_battle_data.py b/fastchat/serve/monitor/clean_battle_data.py index 63c8e565b..4cab1af42 100644 --- a/fastchat/serve/monitor/clean_battle_data.py +++ b/fastchat/serve/monitor/clean_battle_data.py @@ -34,6 +34,7 @@ "palm", "lamda", "google", + "llama", "NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.", ] @@ -43,11 +44,7 @@ def get_log_files(max_num_files=None): dates = [] - for month in [4, 5, 6, 7]: - for day in range(1, 32): - dates.append(f"2023-{month:02d}-{day:02d}") - - for month in [8]: + for month in [4, 5, 6, 7, 8, 9]: for day in range(1, 32): dates.append(f"2023-{month:02d}-{day:02d}") @@ -85,7 +82,7 @@ def replace_model_name(old_name): ) -def clean_battle_data(log_files): +def clean_battle_data(log_files, exclude_model_names): data = [] for filename in tqdm(log_files, desc="read files"): for retry in range(5): @@ -173,6 +170,11 @@ def clean_battle_data(log_files): # Replace bard with palm models = [replace_model_name(m) for m in models] + # Exclude certain models + if any(x in exclude_model_names for x in models): + ct_invalid += 1 + continue + question_id = row["states"][0]["conv_id"] conversation_a = to_openai_format( row["states"][0]["messages"][row["states"][0]["offset"] :] @@ -186,7 +188,7 @@ def clean_battle_data(log_files): all_ips[ip] = len(all_ips) user_id = all_ips[ip] - # Save the result + # Save the results battles.append( dict( question_id=question_id, @@ -228,10 +230,11 @@ def clean_battle_data(log_files): parser.add_argument( "--mode", type=str, choices=["simple", "conv_release"], default="simple" ) + parser.add_argument("--exclude-model-names", type=str, nargs="+") args = parser.parse_args() log_files = get_log_files(args.max_num_files) - battles = clean_battle_data(log_files) + battles = clean_battle_data(log_files, args.exclude_model_names or []) last_updated_tstamp = battles[-1]["tstamp"] cutoff_date = datetime.datetime.fromtimestamp( last_updated_tstamp, tz=timezone("US/Pacific") diff --git a/fastchat/serve/monitor/clean_chat_data.py b/fastchat/serve/monitor/clean_chat_data.py index 76b4da50d..86d15bac2 100644 --- a/fastchat/serve/monitor/clean_chat_data.py +++ b/fastchat/serve/monitor/clean_chat_data.py @@ -28,11 +28,7 @@ def get_log_files(max_num_files=None): dates = [] - for month in [4, 5, 6, 7]: - for day in range(1, 32): - dates.append(f"2023-{month:02d}-{day:02d}") - - for month in [8]: + for month in [4, 5, 6, 7, 8, 9, 10]: for day in range(1, 32): dates.append(f"2023-{month:02d}-{day:02d}") @@ -48,7 +44,7 @@ def get_log_files(max_num_files=None): return filenames -def clean_chat_data(log_files): +def clean_chat_data(log_files, action_type): raw_data = [] for filename in tqdm(log_files, desc="read files"): for retry in range(5): 
@@ -60,7 +56,7 @@ def clean_chat_data(log_files): for l in lines: row = json.loads(l) - if row["type"] == "chat": + if row["type"] == action_type: raw_data.append(row) all_models = set() @@ -70,18 +66,26 @@ def clean_chat_data(log_files): ct_invalid = 0 ct_network_error = 0 for row in raw_data: - if "conv_id" not in row["state"]: + try: + if action_type in ["chat", "upvote", "downvote"]: + state = row["state"] + model = row["model"] + elif action_type == "leftvote": + state = row["states"][0] + model = row["states"][0]["model_name"] + elif action_type == "rightvote": + state = row["states"][1] + model = row["states"][1]["model_name"] + conversation_id = state["conv_id"] + except KeyError: ct_invalid_conv_id += 1 continue - conversation_id = row["state"]["conv_id"] if conversation_id is None: ct_invalid_conv_id += 1 continue - state = row["state"] conversation = to_openai_format(state["messages"][state["offset"] :]) - model = row["model"] if not isinstance(model, str): ct_invalid += 1 continue @@ -150,17 +154,18 @@ def clean_chat_data(log_files): if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument("--action-type", type=str, default="chat") parser.add_argument("--max-num-files", type=int) args = parser.parse_args() log_files = get_log_files(args.max_num_files) - chats = clean_chat_data(log_files) + chats = clean_chat_data(log_files, args.action_type) last_updated_tstamp = chats[-1]["tstamp"] cutoff_date = datetime.datetime.fromtimestamp( last_updated_tstamp, tz=timezone("US/Pacific") ).strftime("%Y%m%d") - output = f"clean_chat_conv_{cutoff_date}.json" + output = f"clean_{args.action_type}_conv_{cutoff_date}.json" with open(output, "w") as fout: json.dump(chats, fout, indent=2, ensure_ascii=False) print(f"Write cleaned data to {output}") diff --git a/fastchat/serve/monitor/conv_release_scripts/count_unique_users.py b/fastchat/serve/monitor/dataset_release_scripts/arena_33k/count_unique_users.py similarity index 100% rename from fastchat/serve/monitor/conv_release_scripts/count_unique_users.py rename to fastchat/serve/monitor/dataset_release_scripts/arena_33k/count_unique_users.py diff --git a/fastchat/serve/monitor/conv_release_scripts/filter_bad_conv.py b/fastchat/serve/monitor/dataset_release_scripts/arena_33k/filter_bad_conv.py similarity index 100% rename from fastchat/serve/monitor/conv_release_scripts/filter_bad_conv.py rename to fastchat/serve/monitor/dataset_release_scripts/arena_33k/filter_bad_conv.py diff --git a/fastchat/serve/monitor/conv_release_scripts/merge_field.py b/fastchat/serve/monitor/dataset_release_scripts/arena_33k/merge_field.py similarity index 100% rename from fastchat/serve/monitor/conv_release_scripts/merge_field.py rename to fastchat/serve/monitor/dataset_release_scripts/arena_33k/merge_field.py diff --git a/fastchat/serve/monitor/conv_release_scripts/sample.py b/fastchat/serve/monitor/dataset_release_scripts/arena_33k/sample.py similarity index 100% rename from fastchat/serve/monitor/conv_release_scripts/sample.py rename to fastchat/serve/monitor/dataset_release_scripts/arena_33k/sample.py diff --git a/fastchat/serve/monitor/conv_release_scripts/upload_hf_dataset.py b/fastchat/serve/monitor/dataset_release_scripts/arena_33k/upload_hf_dataset.py similarity index 100% rename from fastchat/serve/monitor/conv_release_scripts/upload_hf_dataset.py rename to fastchat/serve/monitor/dataset_release_scripts/arena_33k/upload_hf_dataset.py diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/compute_stats.py 
b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/compute_stats.py new file mode 100644 index 000000000..97abaaa0d --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/compute_stats.py @@ -0,0 +1,119 @@ +""" +From colab: +https://colab.research.google.com/drive/1oMdw_Lqgmd6DletSOLHsyD-Rc96cRShs?usp=sharing +""" +import argparse +import datetime +import json +import os +from pytz import timezone +import time + +import kaleido +import numpy as np +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +from tqdm import tqdm + +import plotly.io as pio + +pio.kaleido.scope.mathjax = None + +parser = argparse.ArgumentParser() +parser.add_argument("--in-file", type=str, required=True) +parser.add_argument("--scale", type=int, required=True) +args = parser.parse_args() + +filename = args.in_file +scale = args.scale +convs = json.load(open(filename)) +df = pd.DataFrame(convs) +df + +print(f"#ips: {df['user_id'].nunique() * scale}") +print(f"#models: {df['model'].nunique()}") +print(f"#language: {df['language'].nunique()}") +print(f"#turns: {df['turn'].mean()}") + +model_counts = df["model"].value_counts() * scale +# print("model counts", model_counts) +fig = px.bar(x=model_counts.index, y=model_counts) +fig.update_layout( + xaxis_title=None, + yaxis_title="Count", + height=200, + width=950, + margin=dict(l=0, r=0, t=0, b=0), +) +fig.show() +fig.write_image("model_count.pdf") + + +model_counts = df["language"].value_counts().head(25) * scale +fig = px.bar(x=model_counts.index, y=model_counts) +fig.update_layout( + xaxis_title=None, + yaxis_title="Count", + height=200, + width=950, + margin=dict(l=0, r=0, t=0, b=0), +) +fig.show() +fig.write_image("language_count.pdf") + +chat_dates = [ + datetime.datetime.fromtimestamp(x, tz=timezone("US/Pacific")).strftime("%Y-%m-%d") + for x in df["tstamp"] +] + + +def to_remove(x): + for d in ["08-09", "08-08", "08-07", "08-06", "08-05", "08-04"]: + if d in x: + return True + return False + + +chat_dates = [x for x in chat_dates if not to_remove(x)] + +chat_dates_counts = pd.value_counts(chat_dates) * scale +print(f"mean #chat per day: {np.mean(chat_dates_counts):.2f}") + +fig = px.bar(x=chat_dates_counts.index, y=chat_dates_counts) +fig.update_layout( + xaxis_title="Dates", + yaxis_title="Count", + height=200, + width=950, + margin=dict(l=0, r=0, t=0, b=0), +) +fig.show() +fig.write_image("daily_conversation_count.pdf") + +import transformers + +tokenizer = transformers.AutoTokenizer.from_pretrained( + "lmsys/vicuna-7b-v1.5", use_fast=False +) + +prompts = [] +responses = [] +for conv in df["conversation"]: + for row in conv: + if row["role"] == "user": + prompts.append(row["content"]) + else: + responses.append(row["content"]) + +print(f"#prompts: {len(prompts)}") +print(f"#responses: {len(responses)}") + + +prompt_lens = [len(tokenizer(x).input_ids) for x in tqdm(prompts)] +print() +print(f"mean prompt len: {np.mean(prompt_lens):.2f}") + +response_lens = [len(tokenizer(x).input_ids) if x else 0 for x in tqdm(responses)] +print() +print(f"mean response len: {np.mean(response_lens):.2f}") diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/filter_bad_conv.py b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/filter_bad_conv.py new file mode 100644 index 000000000..3ccde1ca5 --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/filter_bad_conv.py @@ -0,0 +1,148 @@ +""" +Filter conversations for release. 
+ +Dependency: +pip install opencc-python-reimplementedpip install opencc-python-reimplemented + +Usage: +python3 filter_bad_conv_lmsys_chat_1m.py --in clean_battle_conv_20230630_tagged_v1_pii.json +""" +import argparse +from concurrent.futures import ProcessPoolExecutor +from collections import defaultdict +from enum import Enum, auto +import json +import os +import random + +from tqdm import tqdm +import opencc + +BLOCKED_WORDS_FILENAME = "blocked_words.json" +blocked_words = [] +frequency = defaultdict(lambda: 0) + +cc_converter = opencc.OpenCC("t2s") + + +class TypeCode(Enum): + CORRECT = auto() + ANONYMIZED = auto() + REDACTED = auto() + BAD_FORMAT = auto() + BLOCKED_WORD = auto() + BLOCKED_MODEL = auto() + TOO_SHORT = auto() + TOO_FREQUENT = auto() + + +def detect_type(conv): + for key in ["conversation_a", "conversation_b", "conversation"]: + if key not in conv: + continue + + messages = [row["content"] for row in conv[key]] + for msg in messages: + if not isinstance(msg, str): + return TypeCode.BAD_FORMAT + + if len(messages) == 0: + return TypeCode.BAD_FORMAT + + user_prompts = [ + row["content"].lower().strip() for row in conv[key] if row["role"] == "user" + ] + + for msg in messages: + msg = cc_converter.convert(msg.lower()) + if "" in msg: + return TypeCode.ANONYMIZED + if "" in msg: + return TypeCode.REDACTED + + for w in blocked_words: + if w in msg: + return TypeCode.BLOCKED_WORD + + return TypeCode.CORRECT + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--in-file", type=str, required=True) + parser.add_argument("--sample", type=int) + args = parser.parse_args() + + # Read conversations + convs = json.load(open(args.in_file)) + print(f"#conv: {len(convs)}") + + # Read blocked words + if os.path.exists(BLOCKED_WORDS_FILENAME): + blocked_words = json.load(open(BLOCKED_WORDS_FILENAME)) + blocked_words = [cc_converter.convert(w) for w in blocked_words] + + # Start filter + ct_bad_format = 0 + ct_anonymized = 0 + ct_redacted = 0 + ct_error = 0 + ct_lang_filter = 0 + ct_flagged = 0 + ct_blocked_word = 0 + ct_blocked_model = 0 + ct_too_short = 0 + ct_too_frequent = 0 + + type_codes = [] + with ProcessPoolExecutor() as executor: + for result in tqdm(executor.map(detect_type, convs), total=len(convs)): + type_codes.append(result) + + new_convs = [] + for conv, type_code in zip(convs, type_codes): + if type_code == TypeCode.BAD_FORMAT: + ct_bad_format += 1 + continue + + if type_code == TypeCode.ANONYMIZED: + ct_anonymized += 1 + continue + elif type_code == TypeCode.REDACTED: + ct_redacted += 1 + continue + elif type_code == TypeCode.BLOCKED_WORD: + ct_blocked_word += 1 + continue + elif type_code == TypeCode.BLOCKED_MODEL: + ct_blocked_model += 1 + continue + elif type_code == TypeCode.TOO_SHORT: + ct_too_short += 1 + continue + elif type_code == TypeCode.TOO_FREQUENT: + ct_too_frequent += 1 + continue + + if "openai_moderation" in conv and conv["openai_moderation"]["flagged"]: + ct_flagged += 1 + continue + + if type_code in [TypeCode.CORRECT]: + new_convs.append(conv) + + if args.sample: + random.seed(42) + random.shuffle(new_convs) + new_convs = new_convs[: args.sample] + + print(f"ct_anonymized: {ct_anonymized}, ct_redacted: {ct_redacted}") + print(f"ct_bad_format: {ct_bad_format}, ct_flagged: {ct_flagged}") + print(f"ct_blocked_word: {ct_blocked_word}, ct_blocked_model: {ct_blocked_model}") + print(f"ct_too_short: {ct_too_short}, ct_too_frequent: {ct_too_frequent}") + print(f"new_conv: {len(new_convs)}") + + out_file = 
args.in_file.replace(".json", ".s1.json") + print(f"Output to {out_file}") + with open(out_file, "w") as fout: + json.dump(new_convs, fout, indent=2, ensure_ascii=False) diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/final_post_processing.py b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/final_post_processing.py new file mode 100644 index 000000000..e368e92a1 --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/final_post_processing.py @@ -0,0 +1,27 @@ +import argparse +import json + +from tqdm import tqdm +import numpy as np + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--in-file", type=str, required=True) + args = parser.parse_args() + + # Read conversations + convs = json.load(open(args.in_file)) + print(f"#conv: {len(convs)}") + + # Delete some fileds + for c in convs: + del c["tstamp"] + del c["user_id"] + + # Write + print(f"#out conv: {len(convs)}") + out_file = args.in_file.replace(".json", ".s2.json") + print(f"Output to {out_file}") + with open(out_file, "w") as fout: + json.dump(convs, fout, indent=2, ensure_ascii=False) diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/instructions.md b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/instructions.md new file mode 100644 index 000000000..4c439731f --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/instructions.md @@ -0,0 +1,23 @@ +``` +export BASE=clean_conv_20230809_100k_pii +export SCALE=10 + +# filter words +python3 filter_bad_conv.py --in $BASE.json + +# Clean up some fileds (e.g., timestamps) +python3 final_post_processing.py --in $BASE.s1.json + +# upload to hf +python3 upload_hf_dataset.py --in $BASE.s1.s2.json + +# Make another version with openai moderation tag +python3 merge_oai_tag.py --in $BASE.s1.s2.json + +# Make visualizations +python3 compute_stats.py --in $BASE.s1.json --scale $SCALE + +# Copy figures +scp "atlas:/data/lmzheng/FastChat/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/*.pdf" . 
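+
+# Optional sanity check (not part of the original pipeline): count how many
+# conversations made it into the final file before uploading, e.g.
+# python3 -c "import json; print(len(json.load(open('$BASE.s1.s2.json'))))"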
+``` + diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/merge_oai_tag.py b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/merge_oai_tag.py new file mode 100644 index 000000000..18bef5f19 --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/merge_oai_tag.py @@ -0,0 +1,45 @@ +import argparse +import json +import time + +from tqdm import tqdm + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--in-file", type=str, required=True) + parser.add_argument("--sample", type=int) + args = parser.parse_args() + + tag_file = "clean_conv_20230809_1.5M_oai_filter_v2.json" + # tag_file = "clean_conv_20230809_1.5M_oai_filter_v2_100k.json" + in_file = args.in_file + tic = time.time() + + # Load tags + print("Load tags...") + tag_data = json.load(open(tag_file)) + tag_dict = {} + for c in tqdm(tag_data): + tag_dict[c["conversation_id"]] = [x["oai_filter"] for x in c["conversation"]] + print(f"elapsed: {time.time() - tic:.2f} s") + + # Append to input_file + print("Load inputs...") + input_data = json.load(open(in_file)) + for c in tqdm(input_data): + cid = c["conversation_id"] + if cid in tag_dict: + c["openai_moderation"] = tag_dict[cid] + else: + print(f"missing tag for conv {cid}") + exit() + print(f"elapsed: {time.time() - tic:.2f} s") + + # Write output + print("Write outputs...") + out_file = in_file.replace(".json", ".with_tag.json") + print(f"Output to {out_file}") + with open(out_file, "w") as fout: + json.dump(input_data, fout, indent=2, ensure_ascii=False) + print(f"elapsed: {time.time() - tic:.2f} s") diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/process_all.sh b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/process_all.sh new file mode 100644 index 000000000..5bae9fbad --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/process_all.sh @@ -0,0 +1,18 @@ +export BASE=clean_conv_20230809_1.5M_pii +#export BASE=clean_conv_20230809_100k_pii +export SCALE=1 + +# Filter words +python3 filter_bad_conv.py --in $BASE.json --sample 1000000 + +# Clean up some fileds (e.g., timestamps) +python3 final_post_processing.py --in $BASE.s1.json + +# Upload to hf +python3 upload_hf_dataset.py --in $BASE.s1.s2.json + +# Make another version with openai moderation tag +python3 merge_oai_tag.py --in $BASE.s1.s2.json + +# Make visualizations +python3 compute_stats.py --in $BASE.s1.json --scale $SCALE diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/sample.py b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/sample.py new file mode 100644 index 000000000..3b6da455f --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/sample.py @@ -0,0 +1,32 @@ +""" +Count the unique users in a battle log file. 
+ +Usage: +python3 -input in.json --number 1000 +""" + +import argparse +import json +import random + +K = 1000 + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input", type=str) + parser.add_argument("--number", type=int, nargs="+") + args = parser.parse_args() + + convs = json.load(open(args.input)) + random.seed(42) + random.shuffle(convs) + + for number in args.number: + new_convs = convs[:number] + + output = args.input.replace(".json", f"_{number//K}k.json") + with open(output, "w") as fout: + json.dump(new_convs, fout, indent=2, ensure_ascii=False) + + print(f"#in: {len(convs)}, #out: {len(new_convs)}") + print(f"Write to file: {output}") diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/upload_hf_dataset.py b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/upload_hf_dataset.py new file mode 100644 index 000000000..41d0fbdb5 --- /dev/null +++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/upload_hf_dataset.py @@ -0,0 +1,17 @@ +""" +Upload to huggingface. +""" +import argparse +import json +from datasets import Dataset, DatasetDict, load_dataset + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--in-file", type=str, required=True) + args = parser.parse_args() + + objs = json.load(open(args.in_file)) + print(f"#convs: {len(objs)}") + data = Dataset.from_list(objs) + data.push_to_hub("lmsys/lmsys-chat-1m", private=True) diff --git a/fastchat/serve/monitor/hf_space_leaderboard_app.py b/fastchat/serve/monitor/hf_space_leaderboard_app.py deleted file mode 100644 index 8fb21fbdc..000000000 --- a/fastchat/serve/monitor/hf_space_leaderboard_app.py +++ /dev/null @@ -1,258 +0,0 @@ -"""A gradio app that renders a static leaderboard. This is used for Hugging Face Space.""" -import ast -import argparse -import pickle - -import gradio as gr -import numpy as np - - -notebook_url = "https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing" - - -basic_component_values = [None] * 6 -leader_component_values = [None] * 5 - - -def make_leaderboard_md(elo_results): - leaderboard_md = f""" -# Leaderboard -| [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | - -🏆 This leaderboard is based on the following three benchmarks. -- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 40K+ user votes to compute Elo ratings. -- [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses. -- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks. - -💻 We use [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) to compute MT-bench scores (single-answer grading on a scale of 10) and win rates (against gpt-3.5). The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MMLU scores are computed by [InstructEval](https://github.com/declare-lab/instruct-eval) and [Chain-of-Thought Hub](https://github.com/FranxYao/chain-of-thought-hub). Higher values are better for all benchmarks. Empty cells mean not available. 
-""" - return leaderboard_md - - -def make_leaderboard_md_live(elo_results): - leaderboard_md = f""" -# Leaderboard -Last updated: {elo_results["last_updated_datetime"]} -{elo_results["leaderboard_table"]} -""" - return leaderboard_md - - -def update_elo_components(max_num_files, elo_results_file): - log_files = get_log_files(max_num_files) - - # Leaderboard - if elo_results_file is None: # Do live update - battles = clean_battle_data(log_files) - elo_results = report_elo_analysis_results(battles) - - leader_component_values[0] = make_leaderboard_md_live(elo_results) - leader_component_values[1] = elo_results["win_fraction_heatmap"] - leader_component_values[2] = elo_results["battle_count_heatmap"] - leader_component_values[3] = elo_results["bootstrap_elo_rating"] - leader_component_values[4] = elo_results["average_win_rate_bar"] - - # Basic stats - basic_stats = report_basic_stats(log_files) - md0 = f"Last updated: {basic_stats['last_updated_datetime']}" - - md1 = "### Action Histogram\n" - md1 += basic_stats["action_hist_md"] + "\n" - - md2 = "### Anony. Vote Histogram\n" - md2 += basic_stats["anony_vote_hist_md"] + "\n" - - md3 = "### Model Call Histogram\n" - md3 += basic_stats["model_hist_md"] + "\n" - - md4 = "### Model Call (Last 24 Hours)\n" - md4 += basic_stats["num_chats_last_24_hours"] + "\n" - - basic_component_values[0] = md0 - basic_component_values[1] = basic_stats["chat_dates_bar"] - basic_component_values[2] = md1 - basic_component_values[3] = md2 - basic_component_values[4] = md3 - basic_component_values[5] = md4 - - -def update_worker(max_num_files, interval, elo_results_file): - while True: - tic = time.time() - update_elo_components(max_num_files, elo_results_file) - durtaion = time.time() - tic - print(f"update duration: {durtaion:.2f} s") - time.sleep(max(interval - durtaion, 0)) - - -def load_demo(url_params, request: gr.Request): - logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}") - return basic_component_values + leader_component_values - - -def model_hyperlink(model_name, link): - return f'{model_name}' - - -def load_leaderboard_table_csv(filename, add_hyperlink=True): - lines = open(filename).readlines() - heads = [v.strip() for v in lines[0].split(",")] - rows = [] - for i in range(1, len(lines)): - row = [v.strip() for v in lines[i].split(",")] - for j in range(len(heads)): - item = {} - for h, v in zip(heads, row): - if h == "Arena Elo rating": - if v != "-": - v = int(ast.literal_eval(v)) - else: - v = np.nan - elif h == "MMLU": - if v != "-": - v = round(ast.literal_eval(v) * 100, 1) - else: - v = np.nan - elif h == "MT-bench (win rate %)": - if v != "-": - v = round(ast.literal_eval(v[:-1]), 1) - else: - v = np.nan - elif h == "MT-bench (score)": - if v != "-": - v = round(ast.literal_eval(v), 2) - else: - v = np.nan - item[h] = v - if add_hyperlink: - item["Model"] = model_hyperlink(item["Model"], item["Link"]) - rows.append(item) - - return rows - - -def build_basic_stats_tab(): - empty = "Loading ..." 
- basic_component_values[:] = [empty, None, empty, empty, empty, empty] - - md0 = gr.Markdown(empty) - gr.Markdown("#### Figure 1: Number of model calls and votes") - plot_1 = gr.Plot(show_label=False) - with gr.Row(): - with gr.Column(): - md1 = gr.Markdown(empty) - with gr.Column(): - md2 = gr.Markdown(empty) - with gr.Row(): - with gr.Column(): - md3 = gr.Markdown(empty) - with gr.Column(): - md4 = gr.Markdown(empty) - return [md0, plot_1, md1, md2, md3, md4] - - -def build_leaderboard_tab(elo_results_file, leaderboard_table_file): - if elo_results_file is None: # Do live update - md = "Loading ..." - p1 = p2 = p3 = p4 = None - else: - with open(elo_results_file, "rb") as fin: - elo_results = pickle.load(fin) - - md = make_leaderboard_md(elo_results) - p1 = elo_results["win_fraction_heatmap"] - p2 = elo_results["battle_count_heatmap"] - p3 = elo_results["bootstrap_elo_rating"] - p4 = elo_results["average_win_rate_bar"] - - md_1 = gr.Markdown(md, elem_id="leaderboard_markdown") - - if leaderboard_table_file: - data = load_leaderboard_table_csv(leaderboard_table_file) - headers = [ - "Model", - "Arena Elo rating", - "MT-bench (score)", - "MT-bench (win rate %)", - "MMLU", - "License", - ] - values = [] - for item in data: - row = [] - for key in headers: - value = item[key] - row.append(value) - values.append(row) - values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9) - - headers[1] = "⭐ " + headers[1] - headers[2] = "📈 " + headers[2] - - gr.Dataframe( - headers=headers, - datatype=["markdown", "number", "number", "number", "number", "str"], - value=values, - elem_id="leaderboard_dataframe", - ) - gr.Markdown( - "If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model)." - ) - else: - pass - - gr.Markdown( - f"""## More Statistics for Chatbot Arena\n -We added some additional figures to show more statistics. The code for generating them is also included in this [notebook]({notebook_url}). -Please note that you may see different orders from different ranking methods. This is expected for models that perform similarly, as demonstrated by the confidence interval in the bootstrap figure. Going forward, we prefer the classical Elo calculation because of its scalability and interpretability. You can find more discussions in this blog [post](https://lmsys.org/blog/2023-05-03-arena/). -""" - ) - - leader_component_values[:] = [md, p1, p2, p3, p4] - - with gr.Row(): - with gr.Column(): - gr.Markdown( - "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. 
B Battles" - ) - plot_1 = gr.Plot(p1, show_label=False) - with gr.Column(): - gr.Markdown( - "#### Figure 2: Battle Count for Each Combination of Models (without Ties)" - ) - plot_2 = gr.Plot(p2, show_label=False) - with gr.Row(): - with gr.Column(): - gr.Markdown( - "#### Figure 3: Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)" - ) - plot_3 = gr.Plot(p3, show_label=False) - with gr.Column(): - gr.Markdown( - "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)" - ) - plot_4 = gr.Plot(p4, show_label=False) - return [md_1, plot_1, plot_2, plot_3, plot_4] - - -def build_demo(elo_results_file, leaderboard_table_file): - text_size = gr.themes.sizes.text_lg - - with gr.Blocks( - title="Chatbot Arena Leaderboard", - theme=gr.themes.Base(text_size=text_size), - ) as demo: - leader_components = build_leaderboard_tab( - elo_results_file, leaderboard_table_file - ) - - return demo - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--share", action="store_true") - args = parser.parse_args() - - demo = build_demo("elo_results_20230619.pkl", "leaderboard_table_20230619.csv") - demo.launch(share=args.share) diff --git a/fastchat/serve/monitor/intersect_conv_file.py b/fastchat/serve/monitor/intersect_conv_file.py new file mode 100644 index 000000000..9eadd7cd5 --- /dev/null +++ b/fastchat/serve/monitor/intersect_conv_file.py @@ -0,0 +1,25 @@ +""" +Take the intersection of two conversation files. + +Usage: python3 -m fastchat.data.merge --input input.json --conv-id conv_id_file.json --out intersect.json +""" + +import argparse +import json + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input", type=str, required=True) + parser.add_argument("--conv-id", type=str, required=True) + parser.add_argument("--out-file", type=str, default="intersect.json") + args = parser.parse_args() + + conv_id_objs = json.load(open(args.conv_id, "r")) + conv_ids = set(x["conversation_id"] for x in conv_id_objs) + + objs = json.load(open(args.input, "r")) + after_objs = [x for x in objs if x["conversation_id"] in conv_ids] + + print(f"#in: {len(objs)}, #out: {len(after_objs)}") + json.dump(after_objs, open(args.out_file, "w"), indent=2, ensure_ascii=False) diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index 395f2bf84..5efe8486c 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -1,5 +1,10 @@ -# sudo apt install pkg-config libicu-dev -# pip install pytz gradio gdown plotly polyglot pyicu pycld2 tabulate +""" +Live monitor of the website statistics and leaderboard. + +Dependency: +sudo apt install pkg-config libicu-dev +pip install pytz gradio gdown plotly polyglot pyicu pycld2 tabulate +""" import argparse import ast @@ -30,11 +35,11 @@ def make_leaderboard_md(elo_results): | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://huggingface.co/datasets/lmsys/chatbot_arena_conversations) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | 🏆 This leaderboard is based on the following three benchmarks. -- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 50K+ user votes to compute Elo ratings. +- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. 
We use 70K+ user votes to compute Elo ratings. - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses. - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks. -💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are computed by [InstructEval](https://github.com/declare-lab/instruct-eval) and [Chain-of-Thought Hub](https://github.com/FranxYao/chain-of-thought-hub). Higher values are better for all benchmarks. Empty cells mean not available. +💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: Sept, 2023. """ return leaderboard_md @@ -53,7 +58,7 @@ def update_elo_components(max_num_files, elo_results_file): # Leaderboard if elo_results_file is None: # Do live update - battles = clean_battle_data(log_files) + battles = clean_battle_data(log_files, []) elo_results = report_elo_analysis_results(battles) leader_component_values[0] = make_leaderboard_md_live(elo_results) @@ -241,15 +246,23 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file): "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)" ) plot_4 = gr.Plot(p4, show_label=False) + + from fastchat.serve.gradio_web_server import acknowledgment_md + + gr.Markdown(acknowledgment_md) + return [md_1, plot_1, plot_2, plot_3, plot_4] def build_demo(elo_results_file, leaderboard_table_file): + from fastchat.serve.gradio_web_server import block_css + text_size = gr.themes.sizes.text_lg with gr.Blocks( title="Monitor", theme=gr.themes.Base(text_size=text_size), + css=block_css, ) as demo: with gr.Tabs() as tabs: with gr.Tab("Leaderboard", id=0): diff --git a/fastchat/serve/monitor/replace_model_name.py b/fastchat/serve/monitor/replace_model_name.py deleted file mode 100644 index ff2667e2f..000000000 --- a/fastchat/serve/monitor/replace_model_name.py +++ /dev/null @@ -1,21 +0,0 @@ -""" -Usage: -python3 replace_model_name.py --in clean_conv_20230809_10k.json -""" - -import argparse -import json - -from fastchat.serve.monitor.clean_battle_data import replace_model_name - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--in-file", type=str, required=True) - args = parser.parse_args() - - convs = json.load(open(args.in_file)) - for x in convs: - x["model"] = replace_model_name(x["model"]) - - with open(args.in_file, "w") as fout: - json.dump(convs, fout, indent=2, ensure_ascii=False) diff --git a/fastchat/serve/monitor/summarize_cluster.py b/fastchat/serve/monitor/summarize_cluster.py index 4ca7f48d2..1d5fbcddc 100644 --- a/fastchat/serve/monitor/summarize_cluster.py +++ b/fastchat/serve/monitor/summarize_cluster.py @@ -1,13 +1,14 @@ """ - Usage: -python3 summarize_cluster.py --in results_c20_kmeans_cluster.pkl --model gpt-4 +python3 summarize_cluster.py --in results_c20_kmeans_cluster.pkl --model gpt-4 --num-prompts 100 +python3 
summarize_cluster.py --in results_c20_kmeans_cluster.pkl --model azure-gpt-4-32k --num-prompts 200 """ import argparse import pickle from fastchat.llm_judge.common import ( chat_compeletion_openai, + chat_compeletion_openai_azure, chat_compeletion_anthropic, ) from fastchat.conversation import get_conv_template @@ -33,18 +34,26 @@ def truncate_string(s, l): topics = [] percentages = [] for i, info in enumerate(cluster_infos): - num_samples, prompts = info + num_samples, topk_prompts, random_prompts = info percentage = num_samples / num_total_prompts print( f"cluster {i}, #prompts {num_samples}, percentage: {percentage * 100:.2f}%" ) instruct = "Given a list of user messages, use less than 8 words to summarize a central topic for all messages in English. Your output should only include a single line. Try to be specific." + split = int(args.num_prompts * 0.8) prompt = "\n".join( - [truncate_string(x, l=200) for x in prompts[: args.num_prompts]] + [truncate_string(x, l=200) for x in topk_prompts[:split]] + + [ + truncate_string(x, l=200) + for x in random_prompts[: args.num_prompts - split] + ] ) prompt = "BEGIN OF THE MESSAGE LIST\n" + prompt + "\nEND OF THE MESSAGE LIST." - if "gpt" in model: + if "azure-" in model: + template_name = "chatgpt" + completion_func = chat_compeletion_openai_azure + elif "gpt" in model: template_name = "chatgpt" completion_func = chat_compeletion_openai elif "claude" in model: diff --git a/fastchat/serve/monitor/topic_clustering.py b/fastchat/serve/monitor/topic_clustering.py index fcc45b623..dd15c6edc 100644 --- a/fastchat/serve/monitor/topic_clustering.py +++ b/fastchat/serve/monitor/topic_clustering.py @@ -2,7 +2,7 @@ Usage: python3 topic_clustering.py --in arena.json --english-only --min-length 32 -python3 topic_clustering.py --in clean_conv_20230809_100k.json --english-only --min-length 32 --max-length 1024 +python3 topic_clustering.py --in clean_conv_20230809_100k.json --english-only --min-length 32 --max-length 1536 """ import argparse import json @@ -90,7 +90,7 @@ def get_embeddings(texts, model_name, batch_size): def run_k_means(embeddings, num_clusters): - np.random.seed(0) + np.random.seed(42) clustering_model = KMeans(n_clusters=num_clusters, n_init="auto") clustering_model.fit(embeddings.numpy()) centers = torch.from_numpy(clustering_model.cluster_centers_) @@ -109,7 +109,7 @@ def run_k_means(embeddings, num_clusters): def run_agg_cluster(embeddings, num_clusters): - np.random.seed(0) + np.random.seed(42) clustering_model = AgglomerativeClustering(n_clusters=num_clusters) clustering_model.fit(embeddings) labels = torch.from_numpy(clustering_model.labels_) @@ -124,7 +124,30 @@ def run_agg_cluster(embeddings, num_clusters): # Compute centers centers = [] - for i in range(clustering_model.n_clusters_): + for i in range(len(classes)): + centers.append(embeddings[new_labels == i].mean(axis=0, keepdim=True)) + centers = torch.cat(centers) + return centers, new_labels + + +def run_hdbscan_cluster(embeddings): + import hdbscan + + np.random.seed(42) + clusterer = hdbscan.HDBSCAN(min_cluster_size=10) + labels = torch.from_numpy(clusterer.fit_predict(embeddings)) + + # Sort labels + classes, counts = np.unique(labels, return_counts=True) + indices = np.argsort(counts)[::-1] + classes = [classes[i] for i in indices] + new_labels = torch.empty_like(labels) + for i, c in enumerate(classes): + new_labels[labels == c] = i + + # Compute centers + centers = [] + for i in range(len(classes)): centers.append(embeddings[new_labels == i].mean(axis=0, keepdim=True)) 
centers = torch.cat(centers) return centers, new_labels @@ -160,13 +183,18 @@ def print_topk(texts, labels, topk_indices, show_cut_off): def get_cluster_info(texts, labels, topk_indices): + np.random.seed(42) + cluster_info = [] for k in range(len(topk_indices)): num_samples = torch.sum(labels == k).item() - prompts = [] + topk_prompts = [] for idx in topk_indices[k]: - prompts.append(texts[idx]) - cluster_info.append((num_samples, prompts)) + topk_prompts.append(texts[idx]) + random_prompts = [] + for idx in range(len(topk_indices)): + random_prompts.append(np.random.choice(texts)) + cluster_info.append((num_samples, topk_prompts, random_prompts)) return cluster_info @@ -183,7 +211,10 @@ def get_cluster_info(texts, labels, topk_indices): parser.add_argument("--english-only", action="store_true") parser.add_argument("--num-clusters", type=int, default=20) parser.add_argument( - "--cluster-alg", type=str, choices=["kmeans", "aggcls"], default="kmeans" + "--cluster-alg", + type=str, + choices=["kmeans", "aggcls", "HDBSCAN"], + default="kmeans", ) parser.add_argument("--show-top-k", type=int, default=200) parser.add_argument("--show-cut-off", type=int, default=512) @@ -203,6 +234,8 @@ def get_cluster_info(texts, labels, topk_indices): centers, labels = run_k_means(embeddings, num_clusters) elif args.cluster_alg == "aggcls": centers, labels = run_agg_cluster(embeddings, num_clusters) + elif args.cluster_alg == "HDBSCAN": + centers, labels = run_hdbscan_cluster(embeddings) else: raise ValueError(f"Invalid clustering algorithm: {args.cluster_alg}") @@ -210,8 +243,6 @@ def get_cluster_info(texts, labels, topk_indices): topk_str = print_topk(texts, labels, topk_indices, args.show_cut_off) num_clusters = len(centers) - cluster_info = get_cluster_info(texts, labels, topk_indices) - # Dump results filename_prefix = f"results_c{num_clusters}_{args.cluster_alg}" print(topk_str) @@ -231,5 +262,6 @@ def get_cluster_info(texts, labels, topk_indices): obj = {"cluster": i, "text": text, "sim": score.item()} fout.write(json.dumps(obj, ensure_ascii=False) + "\n") + cluster_info = get_cluster_info(texts, labels, topk_indices) with open(filename_prefix + "_cluster.pkl", "wb") as fout: pickle.dump(cluster_info, fout) diff --git a/fastchat/serve/multi_model_worker.py b/fastchat/serve/multi_model_worker.py index 098c6d11e..13872bbdd 100644 --- a/fastchat/serve/multi_model_worker.py +++ b/fastchat/serve/multi_model_worker.py @@ -178,6 +178,13 @@ def create_multi_model_worker(): action="append", help="One or more model names. Values must be aligned with `--model-path` values.", ) + parser.add_argument( + "--conv-template", + type=str, + default=None, + action="append", + help="Conversation prompt template. Values must be aligned with `--model-path` values. 
If only one value is provided, it will be repeated for all models.", + ) parser.add_argument("--limit-worker-concurrency", type=int, default=5) parser.add_argument("--stream-interval", type=int, default=2) parser.add_argument("--no-register", action="store_true") @@ -201,9 +208,16 @@ def create_multi_model_worker(): if args.model_names is None: args.model_names = [[x.split("/")[-1]] for x in args.model_path] + if args.conv_template is None: + args.conv_template = [None] * len(args.model_path) + elif len(args.conv_template) == 1: # Repeat the same template + args.conv_template = args.conv_template * len(args.model_path) + # Launch all workers workers = [] - for model_path, model_names in zip(args.model_path, args.model_names): + for conv_template, model_path, model_names in zip( + args.conv_template, args.model_path, args.model_names + ): w = ModelWorker( args.controller_address, args.worker_address, @@ -219,6 +233,7 @@ def create_multi_model_worker(): cpu_offloading=args.cpu_offloading, gptq_config=gptq_config, stream_interval=args.stream_interval, + conv_template=conv_template, ) workers.append(w) for model_name in model_names: diff --git a/fastchat/serve/openai_api_server.py b/fastchat/serve/openai_api_server.py index 7454ca012..7239db5bb 100644 --- a/fastchat/serve/openai_api_server.py +++ b/fastchat/serve/openai_api_server.py @@ -460,7 +460,11 @@ async def chat_completion_stream_generator( return decoded_unicode = content["text"].replace("\ufffd", "") delta_text = decoded_unicode[len(previous_text) :] - previous_text = decoded_unicode + previous_text = ( + decoded_unicode + if len(decoded_unicode) > len(previous_text) + else previous_text + ) if len(delta_text) == 0: delta_text = None @@ -579,7 +583,11 @@ async def generate_completion_stream_generator( return decoded_unicode = content["text"].replace("\ufffd", "") delta_text = decoded_unicode[len(previous_text) :] - previous_text = decoded_unicode + previous_text = ( + decoded_unicode + if len(decoded_unicode) > len(previous_text) + else previous_text + ) # todo: index is not apparent choice_data = CompletionResponseStreamChoice( index=i, diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py index 8e255b79c..2fe8e6304 100644 --- a/fastchat/serve/vllm_worker.py +++ b/fastchat/serve/vllm_worker.py @@ -210,6 +210,8 @@ async def api_model_details(request: Request): args.model = args.model_path if args.num_gpus > 1: args.tensor_parallel_size = args.num_gpus + if args.quantizaiton: + args.quantization = args.quantization engine_args = AsyncEngineArgs.from_cli_args(args) engine = AsyncLLMEngine.from_engine_args(engine_args) diff --git a/fastchat/train/train.py b/fastchat/train/train.py index a2c461d78..89dff81dd 100644 --- a/fastchat/train/train.py +++ b/fastchat/train/train.py @@ -69,13 +69,15 @@ def rank0_print(*args): print(*args) -def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str): - """Collects the state dict and dump to disk.""" - state_dict = trainer.model.state_dict() - if trainer.args.should_save: - cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()} - del state_dict - trainer._save(output_dir, state_dict=cpu_state_dict) # noqa +def trainer_save_model_safe(trainer: transformers.Trainer): + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + from torch.distributed.fsdp import StateDictType, FullStateDictConfig + + save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) + with FSDP.state_dict_type( + trainer.model, 
StateDictType.FULL_STATE_DICT, save_policy + ): + trainer.save_model() def preprocess( @@ -279,9 +281,11 @@ def train(): trainer.train(resume_from_checkpoint=True) else: trainer.train() + + # Save model model.config.use_cache = True trainer.save_state() - safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir) + trainer_save_model_safe(trainer) if __name__ == "__main__": diff --git a/fastchat/train/train_mem.py b/fastchat/train/train_mem.py index e4b335284..9ce4913aa 100644 --- a/fastchat/train/train_mem.py +++ b/fastchat/train/train_mem.py @@ -1,7 +1,7 @@ # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. # Need to call this before importing transformers. -from fastchat.train.llama_flash_attn_monkey_patch import ( +from fastchat.train.llama2_flash_attn_monkey_patch import ( replace_llama_attn_with_flash_attn, ) diff --git a/fastchat/utils.py b/fastchat/utils.py index 180cc35c8..947d8b687 100644 --- a/fastchat/utils.py +++ b/fastchat/utils.py @@ -57,18 +57,20 @@ def build_logger(logger_name, logger_filename): logger = logging.getLogger(logger_name) logger.setLevel(logging.INFO) - os.makedirs(LOGDIR, exist_ok=True) - filename = os.path.join(LOGDIR, logger_filename) - handler = logging.handlers.TimedRotatingFileHandler( - filename, when="D", utc=True, encoding="utf-8" - ) - handler.setFormatter(formatter) - - for l in [stdout_logger, stderr_logger, logger]: - if l in visited_loggers: - continue - visited_loggers.add(l) - l.addHandler(handler) + # if LOGDIR is empty, then don't try output log to local file + if LOGDIR != "": + os.makedirs(LOGDIR, exist_ok=True) + filename = os.path.join(LOGDIR, logger_filename) + handler = logging.handlers.TimedRotatingFileHandler( + filename, when="D", utc=True, encoding="utf-8" + ) + handler.setFormatter(formatter) + + for l in [stdout_logger, stderr_logger, logger]: + if l in visited_loggers: + continue + visited_loggers.add(l) + l.addHandler(handler) return logger @@ -199,6 +201,20 @@ def pretty_print_semaphore(semaphore): """ +get_window_url_params_with_tos_js = """ +function() { + const params = new URLSearchParams(window.location.search); + url_params = Object.fromEntries(params); + console.log("url_params", url_params); + + msg = "Users of this website are required to agree to the following terms:\\nThe service is a research preview. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes.\\nThe service collects user dialogue data and reserves the right to distribute it under a Creative Commons Attribution (CC-BY) license." 
+ alert(msg); + + return url_params; + } +""" + + def iter_over_async( async_gen: AsyncGenerator, event_loop: AbstractEventLoop ) -> Generator: @@ -300,3 +316,18 @@ def get_context_length(config): if val is not None: return int(rope_scaling_factor * val) return 2048 + + +def str_to_torch_dtype(dtype: str): + import torch + + if dtype is None: + return None + elif dtype == "float32": + return torch.float32 + elif dtype == "float16": + return torch.float16 + elif dtype == "bfloat16": + return torch.bfloat16 + else: + raise ValueError(f"Unrecognized dtype: {dtype}") diff --git a/pyproject.toml b/pyproject.toml index 04c41bb07..419663aa3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "fschat" -version = "0.2.26" +version = "0.2.29" description = "An open platform for training, serving, and evaluating large language model based chatbots." readme = "README.md" requires-python = ">=3.8" @@ -20,7 +20,7 @@ dependencies = [ ] [project.optional-dependencies] -model_worker = ["accelerate>=0.21", "peft", "sentencepiece", "torch", "transformers>=4.31.0"] +model_worker = ["accelerate>=0.21", "peft", "sentencepiece", "torch", "transformers>=4.31.0", "protobuf"] webui = ["gradio"] train = ["einops", "flash-attn>=2.0", "wandb"] llm_judge = ["openai", "anthropic>=0.3", "ray"] diff --git a/tests/launch_openai_api_test_server.py b/tests/launch_openai_api_test_server.py index ae21869a2..f555a3882 100644 --- a/tests/launch_openai_api_test_server.py +++ b/tests/launch_openai_api_test_server.py @@ -13,17 +13,23 @@ def launch_process(cmd): launch_process("python3 -m fastchat.serve.openai_api_server") models = [ - "lmsys/vicuna-7b-v1.3", - "lmsys/fastchat-t5-3b-v1.0", - "THUDM/chatglm-6b", - "mosaicml/mpt-7b-chat", + ("lmsys/vicuna-7b-v1.5", "model_worker"), + ("lmsys/fastchat-t5-3b-v1.0", "model_worker"), + ("THUDM/chatglm-6b", "model_worker"), + ("mosaicml/mpt-7b-chat", "model_worker"), + ("meta-llama/Llama-2-7b-chat-hf", "vllm_worker"), ] - for i, model_path in enumerate(models): - launch_process( - f"CUDA_VISIBLE_DEVICES={i} python3 -m fastchat.serve.model_worker " - f"--model-path {model_path} --port {30000+i} --worker http://localhost:{30000+i}" + for i, (model_path, worker_name) in enumerate(models): + cmd = ( + f"CUDA_VISIBLE_DEVICES={i} python3 -m fastchat.serve.{worker_name} " + f"--model-path {model_path} --port {30000+i} " + f"--worker-address http://localhost:{30000+i} " ) + if worker_name == "vllm_worker": + cmd += "--tokenizer hf-internal-testing/llama-tokenizer" + + launch_process(cmd) while True: pass diff --git a/tests/test_cli.py b/tests/test_cli.py index 4b8dbcc19..dcefa4bbe 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -7,14 +7,13 @@ def test_single_gpu(): models = [ - "lmsys/vicuna-7b-v1.3", + "lmsys/vicuna-7b-v1.5", "lmsys/longchat-7b-16k", "lmsys/fastchat-t5-3b-v1.0", + "meta-llama/Llama-2-7b-chat-hf", "THUDM/chatglm-6b", "THUDM/chatglm2-6b", "mosaicml/mpt-7b-chat", - "project-baize/baize-v2-7b", - "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b", "tiiuae/falcon-7b-instruct", "~/model_weights/alpaca-7b", "~/model_weights/RWKV-4-Raven-7B-v11x-Eng99%-Other1%-20230429-ctx8192.pth", diff --git a/tests/test_openai_api.py b/tests/test_openai_api.py index 87e8af4ec..d79af8322 100644 --- a/tests/test_openai_api.py +++ b/tests/test_openai_api.py @@ -59,7 +59,7 @@ def test_chat_completion_stream(model): print() -def test_openai_curl(model): +def test_openai_curl(): run_cmd("curl http://localhost:8000/v1/models") 
run_cmd( @@ -67,7 +67,7 @@ def test_openai_curl(model): curl http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "vicuna-7b-v1.3", + "model": "vicuna-7b-v1.5", "messages": [{"role": "user", "content": "Hello! What is your name?"}] }' """ @@ -78,7 +78,7 @@ def test_openai_curl(model): curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "vicuna-7b-v1.3", + "model": "vicuna-7b-v1.5", "prompt": "Once upon a time", "max_tokens": 41, "temperature": 0.5 @@ -91,7 +91,7 @@ def test_openai_curl(model): curl http://localhost:8000/v1/embeddings \ -H "Content-Type: application/json" \ -d '{ - "model": "vicuna-7b-v1.3", + "model": "vicuna-7b-v1.5", "input": "Hello world!" }' """ @@ -106,9 +106,12 @@ def test_openai_curl(model): print(f"===== Test {model} ======") test_completion(model) test_completion_stream(model) - test_embedding(model) test_chat_completion(model) test_chat_completion_stream(model) + try: + test_embedding(model) + except openai.error.APIError as e: + print(f"Embedding error: {e}") print("===== Test curl =====") - test_openai_curl("vicuna-7b-v1.3") + test_openai_curl()