diff --git a/examples/datasets/tldr_preference.py b/examples/datasets/tldr_preference.py index f7c02a8a97..7c1af328a5 100644 --- a/examples/datasets/tldr_preference.py +++ b/examples/datasets/tldr_preference.py @@ -63,8 +63,9 @@ def process(row): format_str = cnndm_format_str if row["batch"] in cnndm_batches else tldr_format_str row["prompt"] = format_str.format(**row["info"]) choice = row["choice"] - chosen = row["summaries"][choice]["text"] - rejected = row["summaries"][1 - choice]["text"] + # need to remove the leading space + chosen = row["summaries"][choice]["text"].strip() + rejected = row["summaries"][1 - choice]["text"].strip() row["chosen"] = [{"role": "user", "content": row["prompt"]}, {"role": "assistant", "content": chosen}] row["rejected"] = [{"role": "user", "content": row["prompt"]}, {"role": "assistant", "content": rejected}] return row