fix: address review comments
Signed-off-by: Mehant Kammakomati <mehant.kammakomati2@ibm.com>
kmehant committed Nov 5, 2024
1 parent a9233fd commit 1ae3f93
Showing 2 changed files with 6 additions and 6 deletions.
8 changes: 4 additions & 4 deletions trl/trainer/sft_trainer.py
@@ -366,12 +366,12 @@ def _prepare_dataset(
                warnings.warn(
                    "You passed a dataset that is already processed (contains an `input_ids` field) together with a valid formatting function. Therefore `formatting_func` will be ignored."
                )

-           formatting_func = lambda x: x["input_ids"]
+           def formatting_func(x):
+               return x["input_ids"]

            if not packing:
                return dataset
            warnings.warn(
                "Since packing is set to True, though the dataset is pretokenized, it will undergo constant length dataset preparation."
            )

# check if torch dataset / dataloader and do nothing
# see https://github.com/huggingface/trl/pull/1468 for why datasets.IterableDataset needs a separate check
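For context, a minimal sketch of the pretokenized path this hunk touches; the example dataset and the `packing` flag below are illustrative assumptions, not part of the commit. When the dataset already carries an `input_ids` column, the formatting function simply passes the token ids through, and without packing the dataset is returned as-is.

from datasets import Dataset

# Hypothetical pretokenized dataset, used only for this sketch.
dataset = Dataset.from_dict({"input_ids": [[101, 7592, 102], [101, 2088, 102]]})
packing = False  # illustrative flag

if dataset.column_names is not None and "input_ids" in dataset.column_names:

    def formatting_func(x):
        # The dataset is already tokenized, so formatting just returns the token ids.
        return x["input_ids"]

    if not packing:
        prepared = dataset  # used as-is; no further tokenization is applied
    else:
        # with packing, these token ids would be wrapped into constant-length chunks
        print(formatting_func(dataset[0]))  # [101, 7592, 102]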
4 changes: 2 additions & 2 deletions trl/trainer/utils.py
@@ -632,9 +632,9 @@ def __init__(
        column_names = (
            dataset.column_names if isinstance(dataset, (datasets.Dataset, datasets.IterableDataset)) else None
        )
-       if column_names and "input_ids" in column_names:
+       if column_names is not None and "input_ids" in column_names:
            self.pretokenized = True
-           # since its tokenized unit of buffer size should be tokens
+           # since the dataset is tokenized, the unit of buffer size should be tokens
            self.max_buffer_size = seq_length * num_of_sequences

def __len__(self):
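A small arithmetic sketch of why the buffer-size unit matters here: for a pretokenized dataset the buffer holds token ids directly, while for raw text ConstantLengthDataset sizes its buffer in characters via an estimated chars-per-token ratio. The numbers below are illustrative, chosen to mirror the constructor's defaults (an assumption, not taken from this diff).

# Illustrative values assumed to match ConstantLengthDataset's defaults.
seq_length = 1024
num_of_sequences = 1024
chars_per_token = 3.6

# Pretokenized dataset: the buffer is filled with token ids, so its size is counted in tokens.
max_buffer_size_tokens = seq_length * num_of_sequences                    # 1_048_576 tokens

# Raw-text dataset: the buffer is filled with characters, scaled by the chars-per-token estimate.
max_buffer_size_chars = seq_length * chars_per_token * num_of_sequences   # 3_774_873.6 characters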
