Skip to content

Commit

Permalink
Add scripts for chat data cleaning and analysis (lm-sys#2335)
Browse files Browse the repository at this point in the history
  • Loading branch information
merrymercy authored Aug 29, 2023
1 parent 42be87e commit 2fbfcbc
Show file tree
Hide file tree
Showing 12 changed files with 398 additions and 23 deletions.
38 changes: 38 additions & 0 deletions docs/commands/conv_release.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
## Chatbot Arena Conversations

1. Gather battles
```
python3 clean_battle_data.py --max-num-files 10 --mode conv_release
```

2. Tag OpenAI moderation
```
python3 tag_openai_moderation.py --in clean_battle_conv_20230814.json
```

3. Clean PII

4. Filter additional blocked words

```
python3 filter_bad_conv.py --in clean_battle_conv_20230630_tagged_v1_pii.json
```

5. Add additional toxicity tag


## All Conversations

1. Gather chats
```
python3 clean_chat_data.py
```

2. Sample
```
python3 conv_release_scripts/sample.py
```


## Prompt distribution

2 changes: 0 additions & 2 deletions docs/training.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,5 +87,3 @@ deepspeed fastchat/train/train_lora_t5.py \
--deepspeed playground/deepspeed_config_s2.json

```


2 changes: 1 addition & 1 deletion fastchat/data/hardcoded_questions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

def identity_questions():
""" "
Adopted from https://github.com/young-geng/koala_data_pipeline/blob/main/process_hard_coded_data.py
Adapted from https://github.com/young-geng/koala_data_pipeline/blob/main/process_hard_coded_data.py
"""
content = []

Expand Down
16 changes: 10 additions & 6 deletions fastchat/serve/monitor/clean_battle_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,15 @@ def to_openai_format(messages):
return ret


def replace_model_name(old_name):
    """Rewrite deprecated/internal model names to their public names.

    Substitutions are applied as ordered substring replacements, so a
    name that matches none of the patterns is returned unchanged.
    """
    renames = [
        ("bard", "palm-2"),
        ("claude-v1", "claude-1"),
        ("claude-instant-v1", "claude-instant-1"),
        ("oasst-sft-1-pythia-12b", "oasst-pythia-12b"),
    ]
    new_name = old_name
    for old, new in renames:
        new_name = new_name.replace(old, new)
    return new_name


def clean_battle_data(log_files):
data = []
for filename in tqdm(log_files, desc="read files"):
Expand Down Expand Up @@ -162,12 +171,7 @@ def clean_battle_data(log_files):
continue

# Replace bard with palm
models = [
m.replace("bard", "palm-2")
.replace("claude-v1", "claude-1")
.replace("claude-instant-v1", "claude-instant-1")
for m in models
]
models = [replace_model_name(m) for m in models]

question_id = row["states"][0]["conv_id"]
conversation_a = to_openai_format(
Expand Down
32 changes: 22 additions & 10 deletions fastchat/serve/monitor/clean_chat_data.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""
Clean chatbot arena battle log.
Clean chatbot arena chat log.
Usage:
python3 clean_battle_data.py --mode conv_release
python3 clean_chat_data.py --mode conv_release
"""
import argparse
import datetime
Expand All @@ -13,8 +13,11 @@

from tqdm import tqdm

from fastchat.serve.monitor.basic_stats import get_log_files, NUM_SERVERS
from fastchat.serve.monitor.clean_battle_data import to_openai_format
from fastchat.serve.monitor.basic_stats import NUM_SERVERS
from fastchat.serve.monitor.clean_battle_data import (
to_openai_format,
replace_model_name,
)
from fastchat.utils import detect_language


Expand All @@ -40,7 +43,7 @@ def get_log_files(max_num_files=None):
if os.path.exists(name):
filenames.append(name)
max_num_files = max_num_files or len(filenames)
filenames = list(reversed(filenames))
# filenames = list(reversed(filenames))
filenames = filenames[-max_num_files:]
return filenames

Expand Down Expand Up @@ -82,6 +85,7 @@ def clean_chat_data(log_files):
if not isinstance(model, str):
ct_invalid += 1
continue
model = replace_model_name(model)

try:
lang_code = detect_language(state["messages"][state["offset"]][1])
Expand Down Expand Up @@ -123,22 +127,30 @@ def clean_chat_data(log_files):
last_updated_tstamp, tz=timezone("US/Pacific")
).strftime("%Y-%m-%d %H:%M:%S %Z")

print(f"#raw: {len(raw_data)}, #chat: {len(chats)}")
# Deduplication
dedup_chats = []
visited_conv_ids = set()
for i in reversed(range(len(chats))):
if chats[i]["conversation_id"] in visited_conv_ids:
continue
visited_conv_ids.add(chats[i]["conversation_id"])
dedup_chats.append(chats[i])

print(
f"#raw: {len(raw_data)}, #chat: {len(chats)}, #dedup_chat: {len(dedup_chats)}"
)
print(
f"#invalid_conv_id: {ct_invalid_conv_id}, #network_error: {ct_network_error}, #invalid: {ct_invalid}"
)
print(f"#models: {len(all_models)}, {all_models}")
print(f"last-updated: {last_updated_datetime}")

return chats
return list(reversed(dedup_chats))


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--max-num-files", type=int)
parser.add_argument(
"--mode", type=str, choices=["simple", "conv_release"], default="simple"
)
args = parser.parse_args()

log_files = get_log_files(args.max_num_files)
Expand Down
21 changes: 21 additions & 0 deletions fastchat/serve/monitor/replace_model_name.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""
Usage:
python3 replace_model_name.py --in clean_conv_20230809_10k.json
"""

import argparse
import json

from fastchat.serve.monitor.clean_battle_data import replace_model_name

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    args = parser.parse_args()

    # Read the conversation list, rewrite model names in place, and write
    # the result back to the same file. Use context managers so the input
    # handle is closed (the original `json.load(open(...))` leaked it).
    with open(args.in_file) as fin:
        convs = json.load(fin)

    for x in convs:
        x["model"] = replace_model_name(x["model"])

    with open(args.in_file, "w") as fout:
        json.dump(convs, fout, indent=2, ensure_ascii=False)
67 changes: 67 additions & 0 deletions fastchat/serve/monitor/summarize_cluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""
Usage:
python3 summarize_cluster.py --in results_c20_kmeans_cluster.pkl --model gpt-4
"""
import argparse
import pickle

from fastchat.llm_judge.common import (
chat_compeletion_openai,
chat_compeletion_anthropic,
)
from fastchat.conversation import get_conv_template


def truncate_string(s, l):
    """Shorten *s* to roughly *l* characters by dropping its middle.

    Strings of length <= l are returned unchanged; longer strings keep
    the first and last l // 2 characters (result length 2 * (l // 2)).
    """
    if len(s) <= l:
        return s
    half = int(l // 2)
    return s[:half] + s[-half:]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", type=str, required=True)
    parser.add_argument("--model", type=str, default="gpt-3.5-turbo")
    parser.add_argument("--num-prompts", type=int, default=100)
    args = parser.parse_args()

    model = args.model

    # Resolve the conversation template and completion backend once, before
    # the loop (the choice is loop-invariant). Fail fast on unsupported
    # models: the original if/elif left template_name/completion_func
    # unbound, raising NameError mid-loop for anything that is neither
    # "gpt" nor "claude".
    if "gpt" in model:
        template_name = "chatgpt"
        completion_func = chat_compeletion_openai  # sic: fastchat API name
    elif "claude" in model:
        template_name = "claude"
        completion_func = chat_compeletion_anthropic
    else:
        raise ValueError(f"Unsupported model for summarization: {model}")

    # Each entry is a (num_samples, prompts) pair for one cluster.
    with open(args.input_file, "rb") as fin:
        cluster_infos = pickle.load(fin)
    num_total_prompts = sum(x[0] for x in cluster_infos)

    topics = []
    percentages = []
    for i, (num_samples, prompts) in enumerate(cluster_infos):
        percentage = num_samples / num_total_prompts
        print(
            f"cluster {i}, #prompts {num_samples}, percentage: {percentage * 100:.2f}%"
        )
        instruct = "Given a list of user messages, use less than 8 words to summarize a central topic for all messages in English. Your output should only include a single line. Try to be specific."
        # Clip each prompt so the joined message list fits in the context.
        prompt = "\n".join(
            [truncate_string(x, l=200) for x in prompts[: args.num_prompts]]
        )
        prompt = "BEGIN OF THE MESSAGE LIST\n" + prompt + "\nEND OF THE MESSAGE LIST."

        conv = get_conv_template(template_name)
        conv.set_system_message(instruct)
        conv.append_message(conv.roles[0], prompt)
        conv.append_message(conv.roles[1], None)

        # temperature=0 for deterministic, reproducible summaries.
        topic = completion_func(model, conv, temperature=0, max_tokens=256)
        print(topic)

        topics.append(topic)
        percentages.append(round(percentage, 6))

    print()
    print(f"topics: {topics}")
    print(f"percentages: {percentages}")
Loading

0 comments on commit 2fbfcbc

Please sign in to comment.