From 3479606c8c6dbb5da96e4990b491e63a48fc7483 Mon Sep 17 00:00:00 2001 From: Costa Huang Date: Wed, 26 Jun 2024 03:18:22 -0400 Subject: [PATCH] Remove the leading space in the tldr preference dataset (#1773) --- examples/datasets/tldr_preference.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/datasets/tldr_preference.py b/examples/datasets/tldr_preference.py index f7c02a8a97..7c1af328a5 100644 --- a/examples/datasets/tldr_preference.py +++ b/examples/datasets/tldr_preference.py @@ -63,8 +63,9 @@ def process(row): format_str = cnndm_format_str if row["batch"] in cnndm_batches else tldr_format_str row["prompt"] = format_str.format(**row["info"]) choice = row["choice"] - chosen = row["summaries"][choice]["text"] - rejected = row["summaries"][1 - choice]["text"] + # need to remove the leading space + chosen = row["summaries"][choice]["text"].strip() + rejected = row["summaries"][1 - choice]["text"].strip() row["chosen"] = [{"role": "user", "content": row["prompt"]}, {"role": "assistant", "content": chosen}] row["rejected"] = [{"role": "user", "content": row["prompt"]}, {"role": "assistant", "content": rejected}] return row