Padding free dpo #2437

Open · wants to merge 57 commits into main from dame-cell's padding_free_dpo branch
Changes from 51 commits
Commits (57)
ca99954
added eos token for ppotrainer
dame-cell Nov 30, 2024
fe1d5f6
remove the unnecessary stuff
dame-cell Nov 30, 2024
b15c635
Update ppo_config.py
dame-cell Nov 30, 2024
1bcb3a4
remove redundant EOS token fallback
dame-cell Dec 1, 2024
2ef2b24
remove redundant EOS token fallback
dame-cell Dec 1, 2024
42a0f73
remove some unnecessary tests stuff
dame-cell Dec 1, 2024
6130a91
added tests and update concatenated_inputs
dame-cell Dec 4, 2024
6732ed2
return only list and also a lot to do
dame-cell Dec 4, 2024
ce67292
padding free not tested but getting closer
dame-cell Dec 10, 2024
91e40aa
rebase and also reevaluate my approach
dame-cell Dec 10, 2024
8a34cb5
merge main
dame-cell Dec 10, 2024
1d38632
fix identation
dame-cell Dec 10, 2024
814d69e
better tests
dame-cell Dec 10, 2024
7dae607
concatenated_forward now supports padding_free
dame-cell Dec 10, 2024
1a59d74
collator now does not return attention masks
dame-cell Dec 10, 2024
562c52e
postion ids and no attention mask works
dame-cell Dec 11, 2024
d194054
update concatenated forward to support padding_free
dame-cell Dec 11, 2024
3855851
update concatenated forward to support padding_free
dame-cell Dec 11, 2024
d9adbfb
Merge branch 'main' into padding_free_dpo
dame-cell Dec 11, 2024
1145006
grad accumalation tests
dame-cell Dec 13, 2024
24f73a4
Merge branch 'padding_free_dpo' of https://github.com/dame-cell/trl i…
dame-cell Dec 13, 2024
f6bd9e1
Resolved merge conflict in ppo_trainer.py
dame-cell Dec 13, 2024
bbd99cf
Resolved merge conflict in ppo_trainer.py
dame-cell Dec 13, 2024
ba4969d
Resolved merge conflict in ppo_trainer.py
dame-cell Dec 13, 2024
187b1e5
Resolved merge conflict in ppo_trainer.py
dame-cell Dec 13, 2024
1d9ce3e
fix identation
dame-cell Dec 13, 2024
8a974cc
comments update
dame-cell Dec 13, 2024
7f0298b
fix some small issue
dame-cell Dec 13, 2024
2900275
fix some small issue
dame-cell Dec 13, 2024
58e779a
fix some small issue
dame-cell Dec 13, 2024
6a1e251
fix some small issue
dame-cell Dec 13, 2024
f92e056
update concatenate_forward to support padding_fre
dame-cell Dec 13, 2024
457e3a1
fix some small issue
dame-cell Dec 13, 2024
f1789f4
fix some small issue
dame-cell Dec 13, 2024
0103728
So we need to make sure to correctlty handle the list
dame-cell Dec 13, 2024
5837faa
by correclty updatuing concatenated_forward it works now
dame-cell Dec 13, 2024
0321c1d
refactoring concatenated_forward and batched same length seq for padd…
dame-cell Dec 14, 2024
a6e2163
update
dame-cell Dec 14, 2024
1328fc3
padding_free in concatenated_forward and update_test
dame-cell Dec 17, 2024
51a2cc6
Merge branch 'main' into padding_free_dpo
dame-cell Dec 17, 2024
986ed71
padding_free in concatenated_forward and update_test
dame-cell Dec 17, 2024
570b79a
padding_free in concatenated_forward and update_test
dame-cell Dec 17, 2024
9dd9564
Merge branch 'padding_free_dpo' of https://github.com/dame-cell/trl i…
dame-cell Dec 17, 2024
b781876
Merge branch 'main' into padding_free_dpo
dame-cell Dec 18, 2024
525ecb2
Merge branch 'main' into padding_free_dpo
dame-cell Dec 19, 2024
c8ce9c8
Merge branch 'main' into padding_free_dpo
dame-cell Dec 19, 2024
b7fad73
Reverted PPO trainer to original version and updated DPO files
dame-cell Dec 19, 2024
55cd219
Merge branch 'padding_free_dpo' of https://github.com/dame-cell/trl i…
dame-cell Dec 19, 2024
5e8df69
Updated DPO files
dame-cell Dec 19, 2024
ba1ded1
Merge branch 'main' into padding_free_dpo
dame-cell Dec 20, 2024
0784202
Merge branch 'main' into padding_free_dpo
dame-cell Dec 21, 2024
ddfed7c
update test_dpo_trainer.py
dame-cell Dec 21, 2024
955c7e8
update dpo_trainer.py
dame-cell Dec 21, 2024
ba4356e
update dpo_trainer.py
dame-cell Dec 21, 2024
c61abb5
update dpo_trainer.py
dame-cell Dec 21, 2024
64e9909
update dpo_trainer.py
dame-cell Dec 22, 2024
68186e7
Merge branch 'main' into padding_free_dpo
dame-cell Dec 22, 2024
60 changes: 4 additions & 56 deletions tests/test_dpo_trainer.py
@@ -472,7 +472,7 @@ def test_dpo_trainer_padding_token_is_none(self):

trainer.train()

def test_dpo_trainer_w_dataset_num_proc(self):
def test_dpo_trainer_padding_free_training(self):
with tempfile.TemporaryDirectory() as tmp_dir:
training_args = DPOConfig(
output_dir=tmp_dir,
@@ -483,74 +483,22 @@ def test_dpo_trainer_w_dataset_num_proc(self):
learning_rate=9e-1,
eval_strategy="steps",
beta=0.1,
dataset_num_proc=5,
report_to="none",
)

dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_preference")

tokenizer = AutoTokenizer.from_pretrained(self.model_id)
tokenizer.pad_token = None

with self.assertRaisesRegex(
ValueError,
expected_regex=r"Can't find `pad_token_id` in the `processing_class`. "
r"Explicitly set `tokenizer.pad_token` \(e.g. `tokenizer.pad_token = tokenizer.eos_token`\) "
r"before instantiating the trainer.",
):
trainer = DPOTrainer(
model=self.model,
ref_model=None,
args=training_args,
processing_class=tokenizer,
train_dataset=dummy_dataset["train"],
eval_dataset=dummy_dataset["test"],
)

trainer.train()

def test_tr_dpo_trainer(self):
with tempfile.TemporaryDirectory() as tmp_dir:
training_args = DPOConfig(
output_dir=tmp_dir,
per_device_train_batch_size=2,
max_steps=3,
remove_unused_columns=False,
gradient_accumulation_steps=4,
Contributor:

Would be good to have tests for this with gradient accumulation too, perhaps using pytest.mark.parameterize?

Author (dame-cell):

All right, will do. Thanks for reviewing 😎 (a parametrized version is sketched after this file's diff.)

learning_rate=9e-1,
eval_strategy="steps",
precompute_ref_log_probs=False,
sync_ref_model=True,
ref_model_mixup_alpha=0.5,
ref_model_sync_steps=1,
report_to="none",
)

dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_preference")

trainer = DPOTrainer(
model=self.model,
ref_model=self.ref_model,
ref_model=None,
args=training_args,
processing_class=self.tokenizer,
tokenizer=self.tokenizer,
padding_free=True,
train_dataset=dummy_dataset["train"],
eval_dataset=dummy_dataset["test"],
)

# params of the ref model as its the same as the model
previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}

trainer.train()

self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])

# check the params have changed
for n, param in previous_trainable_params.items():
new_param = trainer.ref_model.get_parameter(n)
# check the ref model's params have changed - ignore 0 biases
if param.sum() != 0:
self.assertFalse(torch.equal(param, new_param))

@require_no_wandb
def test_dpo_trainer_generate_during_eval_no_wandb(self):
with tempfile.TemporaryDirectory() as tmp_dir:
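Following up on the review thread above, here is a sketch (not part of this PR) of how the padding-free test could be parametrized over gradient accumulation. It assumes the fixtures and imports already present in tests/test_dpo_trainer.py (self.model, self.tokenizer, tempfile, load_dataset, DPOConfig, DPOTrainer, and the parameterized package), and that the padding_free flag is read from DPOConfig:

```python
from parameterized import parameterized

@parameterized.expand([(1,), (2,), (4,)])
def test_dpo_trainer_padding_free_training(self, gradient_accumulation_steps):
    # Sketch only: padding-free training across several gradient accumulation settings.
    with tempfile.TemporaryDirectory() as tmp_dir:
        training_args = DPOConfig(
            output_dir=tmp_dir,
            per_device_train_batch_size=2,
            gradient_accumulation_steps=gradient_accumulation_steps,
            max_steps=3,
            learning_rate=9e-1,
            eval_strategy="steps",
            beta=0.1,
            padding_free=True,  # assumes the flag lives on DPOConfig
            report_to="none",
        )

        dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_preference")

        trainer = DPOTrainer(
            model=self.model,
            ref_model=None,
            args=training_args,
            processing_class=self.tokenizer,
            train_dataset=dummy_dataset["train"],
            eval_dataset=dummy_dataset["test"],
        )
        trainer.train()

        # A finite training loss is enough to show the padding-free path ran end to end.
        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
```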
3 changes: 3 additions & 0 deletions trl/trainer/dpo_config.py
@@ -145,6 +145,8 @@ class DPOConfig(TrainingArguments):
for saving memory and speeding up training by not computing the logits for all tokens, especially in scenarios
when working with very long prompts where labels are ignored (-100).
[Read more](https://huggingface.co/docs/transformers/main/model_doc/llama#transformers.LlamaForCausalLM)
padding_free (`bool`, defaults to `False`):
Whether to use padding-free training. If set to `True`, padding tokens are dropped and the sequences in each batch are packed into a single row processed with position IDs, instead of a padded batch with an attention mask.
"""

learning_rate: float = 1e-6
@@ -192,3 +194,4 @@ class DPOConfig(TrainingArguments):
rpo_alpha: Optional[float] = None
discopop_tau: float = 0.05
use_num_logits_to_keep: bool = False
padding_free: bool = False
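For context, a minimal usage sketch of the new flag (not part of the diff), assuming it is consumed from DPOConfig as in the `self.padding_free = args.padding_free` line added below. The model id is a placeholder for a small causal LM; the dataset is the internal test set used in the tests above:

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

model_id = "your-small-causal-lm"  # placeholder, not a specific checkpoint
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # DPOTrainer requires a pad token

dataset = load_dataset("trl-internal-testing/zen", "standard_preference")

training_args = DPOConfig(
    output_dir="dpo-padding-free",
    per_device_train_batch_size=2,
    max_steps=3,
    padding_free=True,  # new flag added by this PR
    report_to="none",
)

trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=dataset["train"],
)
trainer.train()
```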
145 changes: 96 additions & 49 deletions trl/trainer/dpo_trainer.py
@@ -192,6 +192,7 @@ class DPOTrainer(Trainer):
The function to use to preprocess the logits before computing the metrics.
peft_config (`dict`, defaults to `None`):
The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in a PEFT model.

"""

_tag_names = ["trl", "dpo"]
@@ -347,7 +348,7 @@ def make_inputs_require_grad(module, input, output):
self.model_adapter_name = args.model_adapter_name
self.ref_adapter_name = args.ref_adapter_name
self.reference_free = args.reference_free

self.padding_free = args.padding_free
if ref_model:
self.ref_model = ref_model
elif self.is_peft_model or args.precompute_ref_log_probs:
@@ -1068,8 +1069,10 @@ def dpo_loss(

def concatenated_forward(self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]]):
"""Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.

We do this to avoid doing two forward passes, because it's faster for FSDP.
When `padding_free=True`, padding tokens are dropped and the sequences in the batch are packed into a single
continuous row processed with position IDs, saving the memory and compute otherwise spent on padding.
When `False`, the standard padded batch with an attention mask is used.
"""
num_examples = batch["prompt_input_ids"].shape[0]

@@ -1121,55 +1124,99 @@ def concatenated_forward(self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]]):
attention_mask[i] = torch.roll(attention_mask[i], shifts=-first_one_idx)
loss_mask[i] = torch.roll(loss_mask[i], shifts=-first_one_idx)

# Get the first column idx that is all zeros and remove every column after that
empty_cols = torch.sum(attention_mask, dim=0) == 0
first_empty_col = torch.nonzero(empty_cols)[0].item() if empty_cols.any() else attention_mask.size(1)
input_ids = input_ids[:, :first_empty_col]
attention_mask = attention_mask[:, :first_empty_col]
loss_mask = loss_mask[:, :first_empty_col]
if self.padding_free:
# Pre-calculate sequence lengths
seq_lengths = attention_mask.sum(1)

# Truncate right
if self.args.max_length is not None:
# apply max_length to the individual sequence lengths before concatenation
if self.args.max_length is not None:
seq_lengths = torch.clamp(seq_lengths, max=self.args.max_length)

# truncate input_ids, attention_mask and loss_mask accordingly
input_ids = input_ids[:, : self.args.max_length]
attention_mask = attention_mask[:, : self.args.max_length]
loss_mask = loss_mask[:, : self.args.max_length]

if self.use_num_logits_to_keep:
# Compute num_logits_to_keep based on loss_mask pattern:
# [[0, 0, 0, x, x, x, x],
# [0, 0, 0, x, x, x, 0]]
# ^ start computing logits from here ([:, -(7-3+1):])
first_compute_index = loss_mask.nonzero(as_tuple=True)[1].min()
num_logits_to_keep = loss_mask.shape[1] - first_compute_index
model_kwargs["num_logits_to_keep"] = num_logits_to_keep.item() + 1 # +1 for the first label

outputs = model(input_ids=input_ids, attention_mask=attention_mask, **model_kwargs)

# Offset the logits by one to align with the labels
logits = outputs.logits[:, :-1, :]
labels = input_ids[:, 1:].clone()
loss_mask = loss_mask[:, 1:].bool()

if self.use_num_logits_to_keep:
# Align labels with logits
# logits: -, -, [x2, x3, x4, x5, x6]
# ^ --------- ^ after logits[:, :-1, :]
# labels: [y0, y1, y2, y3, y4, y5, y6]
# ^ --------- ^ with num_logits_to_keep=4, [:, -4:]
# loss_mask: [0, 0, 0, 1, 1, 1, 1]
labels = labels[:, -num_logits_to_keep:]
loss_mask = loss_mask[:, -num_logits_to_keep:]

if logits.shape[:2] != labels.shape[:2]:
# for llava, the returned logits include the image tokens (placed before the text tokens)
seq_len = labels.shape[1]
logits = logits[:, -seq_len:]

# Compute the log probabilities of the labels
labels[~loss_mask] = 0 # dummy token; we'll ignore the losses on these tokens later
per_token_logps = torch.gather(logits.log_softmax(-1), dim=2, index=labels.unsqueeze(2)).squeeze(2)
per_token_logps[~loss_mask] = 0
all_logps = per_token_logps.sum(-1)
total_length = seq_lengths.sum().item()

# Pre-allocate tensors
concatenated_input_ids = torch.zeros(total_length, dtype=input_ids.dtype, device=input_ids.device)

# Fill tensors
current_idx = 0
sequence_boundaries = []
for i in range(input_ids.size(0)):
length = seq_lengths[i].item()
valid_tokens = input_ids[i, :length]
concatenated_input_ids[current_idx : current_idx + length] = valid_tokens
sequence_boundaries.append((current_idx, current_idx + length))
current_idx += length

# Create position ids
position_ids = torch.arange(total_length, device=input_ids.device)

# remove attention mask
model_kwargs.pop("attention_mask", None)

outputs = model(
input_ids=concatenated_input_ids.unsqueeze(0),
position_ids=position_ids.unsqueeze(0),
**model_kwargs,
)

# Process outputs
logits = outputs.logits[0, :-1, :]
labels = concatenated_input_ids[1:].clone()

# Calculate per-token log probabilities
per_token_logps = torch.gather(logits.log_softmax(-1), dim=-1, index=labels.unsqueeze(-1)).squeeze(-1)

# Split sequences back into batch
start_idx = 0
batch_logps = []
for length in seq_lengths:
sequence_logps = per_token_logps[start_idx : start_idx + length - 1]
batch_logps.append(sequence_logps.sum())
start_idx += length

all_logps = torch.stack(batch_logps)

else:
# Truncate right
if self.args.max_length is not None:
input_ids = input_ids[:, : self.args.max_length]
attention_mask = attention_mask[:, : self.args.max_length]
loss_mask = loss_mask[:, : self.args.max_length]

if self.use_num_logits_to_keep:
# Compute num_logits_to_keep based on loss_mask pattern:
# [[0, 0, 0, x, x, x, x],
# [0, 0, 0, x, x, x, 0]]
# ^ start computing logits from here ([:, -(7-3+1):])
first_compute_index = loss_mask.nonzero(as_tuple=True)[1].min()
num_logits_to_keep = loss_mask.shape[1] - first_compute_index
model_kwargs["num_logits_to_keep"] = num_logits_to_keep.item() + 1 # +1 for the first label

# Get the first column idx that is all zeros and remove every column after that
empty_cols = torch.sum(attention_mask, dim=0) == 0
first_empty_col = torch.nonzero(empty_cols)[0].item() if empty_cols.any() else attention_mask.size(1)
input_ids = input_ids[:, :first_empty_col]
attention_mask = attention_mask[:, :first_empty_col]
loss_mask = loss_mask[:, :first_empty_col]

outputs = model(input_ids=input_ids, attention_mask=attention_mask, **model_kwargs)

# Offset the logits by one to align with the labels
logits = outputs.logits[:, :-1, :]
labels = input_ids[:, 1:].clone()
loss_mask = loss_mask[:, 1:].bool()

if logits.shape[:2] != labels.shape[:2]:
# for llava, the returned logits include the image tokens (placed before the text tokens)
seq_len = labels.shape[1]
logits = logits[:, -seq_len:]

labels[~loss_mask] = 0 # dummy token; we'll ignore the losses on these tokens later
per_token_logps = torch.gather(logits.log_softmax(-1), dim=2, index=labels.unsqueeze(2)).squeeze(2)
per_token_logps[~loss_mask] = 0
all_logps = per_token_logps.sum(-1)

output = {}

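To make the padding-free branch above easier to follow, here is a self-contained toy sketch (not the PR code) of the same packing and splitting steps, with random logits standing in for the model call. The continuous position_ids mirror the current implementation in this PR:

```python
import torch

torch.manual_seed(0)
vocab_size, pad_id = 11, 0

# Padded batch of 3 sequences with true lengths 4, 2 and 3
input_ids = torch.tensor([[5, 6, 7, 8],
                          [3, 4, pad_id, pad_id],
                          [9, 1, 2, pad_id]])
attention_mask = (input_ids != pad_id).long()
seq_lengths = attention_mask.sum(1)        # tensor([4, 2, 3])
total_length = int(seq_lengths.sum())      # 9

# Pack every real token into one row, dropping the padding
packed = torch.cat([input_ids[i, : int(n)] for i, n in enumerate(seq_lengths)])
position_ids = torch.arange(total_length)  # continuous ids, as in the PR

# Stand-in for model(input_ids=packed[None], position_ids=position_ids[None])
logits = torch.randn(1, total_length, vocab_size)

# Shift by one: the logits at position j score the token at position j + 1
shifted_logits = logits[0, :-1]
labels = packed[1:]
per_token_logps = shifted_logits.log_softmax(-1).gather(-1, labels.unsqueeze(-1)).squeeze(-1)

# Split the packed log-probs back into one summed log-prob per sequence,
# skipping the position that crosses a sequence boundary
start, logps = 0, []
for length in seq_lengths.tolist():
    logps.append(per_token_logps[start : start + length - 1].sum())
    start += length
all_logps = torch.stack(logps)
print(all_logps)  # one scalar log-prob per original sequence
```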
1 change: 1 addition & 0 deletions trl/trainer/ppo_config.py
@@ -65,5 +65,6 @@ class PPOConfig(OnPolicyConfig):
cliprange: float = 0.2
vf_coef: float = 0.1
cliprange_value: float = 0.2
"""Clip range for the value function."""
gamma: float = 1.0
lam: float = 0.95
3 changes: 1 addition & 2 deletions trl/trainer/ppo_trainer.py
@@ -1,4 +1,4 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -142,7 +142,6 @@ def __init__(
None # disable `pad_token_id` and `eos_token_id` because we just want to
)
self.policy_model.generation_config.pad_token_id = None # generate tokens without truncation / padding

# peft support
if not is_peft_available() and peft_config is not None:
raise ImportError(