Change KTO tokenization to use DPO's #2187

Open · wants to merge 33 commits into base: main · changes shown from 18 commits

Commits (33)
- 9d5105c: add argument for dropout (kawine, Oct 1, 2024)
- 1996fe0: increase default lr (kawine, Oct 1, 2024)
- 000bd35: change default lr in examples (kawine, Oct 1, 2024)
- 0fbcec6: fix bug in calculation of KL batch size (kawine, Oct 1, 2024)
- 847790d: KL batch size should be args.per_device_train_batch_size (kawine, Oct 1, 2024)
- 4a8645a: Update kto_trainer.mdx with hparam recs (kawine, Oct 1, 2024)
- 8aab352: typo (kawine, Oct 1, 2024)
- 599d6f6: allow dropout to be disabled (kawine, Oct 1, 2024)
- 0b2fda2: Merge branch 'huggingface:main' into kto-hyperparam (kawine, Oct 1, 2024)
- 67d7884: Use DPO tokenization functions where possible (kawine, Oct 2, 2024)
- f7d77b6: fix bugs in use of dpotrainer tokenization (kawine, Oct 2, 2024)
- ea23270: Merge branch 'huggingface:main' into kto-tokenize (kawine, Oct 2, 2024)
- 0e23f55: add prefixes and text to batch (kawine, Oct 2, 2024)
- ecfa423: minor changes (kawine, Oct 5, 2024)
- 62dd49f: Merge branch 'kto-tokenize' of https://github.com/kawine/trl into kto… (kawine, Oct 5, 2024)
- 1157202: Merge branch 'huggingface:main' into kto-tokenize (kawine, Oct 5, 2024)
- f0324a6: minor changes (kawine, Oct 6, 2024)
- 61365db: remove unnecessarily cols in kl dataset (kawine, Oct 6, 2024)
- 9fabdca: Update trl/trainer/kto_trainer.py (kashif, Oct 6, 2024)
- b0ee0d8: formatting (kashif, Oct 6, 2024)
- ea88ad1: Merge branch 'main' into kto-tokenize (kashif, Oct 6, 2024)
- 2f83f58: revert from merge (kashif, Oct 6, 2024)
- 42c5e92: fix tests to work with new tokenization format (kawine, Oct 6, 2024)
- 782cc51: fix test (kashif, Oct 6, 2024)
- c83cf4a: add back maybe_unpair_preference_dataset (kashif, Oct 6, 2024)
- b201d63: Update examples/scripts/kto.py (kashif, Oct 6, 2024)
- 729d59e: Update examples/scripts/kto.py (kashif, Oct 6, 2024)
- 9ae7f9f: Merge branch 'main' into kto-tokenize (kawine, Oct 8, 2024)
- 7c3b970: remove twice processing of training data (kawine, Oct 8, 2024)
- a9e87b5: fix more bugs with merge (kawine, Oct 8, 2024)
- cc36a81: Merge branch 'main' into kto-tokenize (kawine, Oct 9, 2024)
- c7f9dda: move tokenization helper functions to utils; streamline KL calc for KTO (kawine, Oct 10, 2024)
- 697dae2: Merge branch 'main' into kto-tokenize (kawine, Oct 10, 2024)
docs/source/kto_trainer.mdx (12 changes: 9 additions & 3 deletions)

````diff
@@ -7,6 +7,7 @@ For a full example have a look at [`examples/scripts/kto.py`].
 
 Depending on how good your base model is, you may or may not need to do SFT before KTO.
 This is different from standard RLHF and DPO, which always require SFT.
+You can also train with imbalanced data (more chosen than rejected examples, or vice-versa), but you will need to adjust hyperparameters accordingly (see below).
 
 ## Expected dataset format
 
@@ -51,7 +52,8 @@ kto_dataset_dict = {
 ```
 
 where the `prompt` contains the context inputs, `completion` contains the corresponding responses and `label` contains the corresponding flag that indicates if the generated completion is desired (`True`) or undesired (`False`).
-A prompt can have multiple responses and this is reflected in the entries being repeated in the dictionary's value arrays. It is required that the dataset contains at least one desirable and one undesirable completion.
+A prompt can have multiple responses and this is reflected in the entries being repeated in the dictionary's value arrays.
+In theory, the dataset must contain at least one desirable and one undesirable completion; however, some people have had success running KTO on _only_ desirable or undesirable data (in the latter case, it is best to use a conservative learning rate).
 
 
 ## Expected model format
@@ -61,13 +63,17 @@ The KTO trainer expects a model of `AutoModelForCausalLM`, compared to PPO that
 
 For a detailed example have a look at the `examples/scripts/kto.py` script. At a high level we need to initialize the `KTOTrainer` with a `model` we wish to train and a reference `ref_model` which we will use to calculate the implicit rewards of the preferred and rejected response.
 
-The `beta` refers to the hyperparameter of the implicit reward, and the dataset contains the 3 entries listed above. Note that the `model` and `ref_model` need to have the same architecture (ie decoder only or encoder-decoder).
+The `beta` refers to the hyperparameter that controls how quickly the loss saturates, and the dataset contains the 3 entries listed above. Note that the `model` and `ref_model` need to have the same architecture (ie decoder only or encoder-decoder).
 
 The `desirable_weight` and `undesirable_weight` refer to the weights placed on the losses for desirable/positive and undesirable/negative examples.
 By default, they are both 1. However, if you have more of one or the other, then you should upweight the less common type such that the ratio of (`desirable_weight` \\(\times\\) number of positives) to (`undesirable_weight` \\(\times\\) number of negatives) is in the range 1:1 to 4:3.
 
 <Tip>
-It is strongly recommended you use a learning rate between `5e-7` and `5e-6` with an effective batch size between `8` and `32`, for both LoRA and full finetuning. Even if you are working with a small dataset, we do not recommend using a learning rate outside this range; instead, using smaller batch sizes and/or more training epochs will give you better results.
+Every choice of `beta` has a maximum learning rate it will tolerate before learning degenerates. For the default `beta = 0.1`, this learning rate is `1e-6` for most models. The lower beta is, the lower your learning rate should be. In general, we strongly recommend a learning rate between `5e-7` and `5e-6`. Even if you are working with a small dataset, we do not recommend using a learning rate outside this range; instead, use more epochs.
 </Tip>
 
+<Tip>
+Use a per-step batch size that is at least 4, and an effective batch size between 16 and 128. Even if your effective batch size is large, if your per-step batch size is poor, then the KL estimate in KTO will be poor.
+</Tip>
+
 ```py
````
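The revised docs above give numeric targets: a weight ratio between 1:1 and 4:3, a learning rate of at most `1e-6` at the default `beta = 0.1`, a per-step batch size of at least 4, and an effective batch size of 16 to 128. As a rough illustration only, here is a minimal sketch of a `KTOConfig`/`KTOTrainer` setup that follows those numbers; the model and dataset names are placeholders and the keyword names assume the TRL version this PR targets.

```py
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import KTOConfig, KTOTrainer

# Placeholder model and dataset; any causal LM plus a KTO-format dataset
# (prompt / completion / label) works the same way.
model_name = "trl-lib/qwen1.5-1.8b-sft"
model = AutoModelForCausalLM.from_pretrained(model_name)
ref_model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_dataset = load_dataset("trl-lib/kto-mix-14k", split="train")

# Weight rule worked example: with 10,000 desirable and 3,000 undesirable completions,
# keeping desirable_weight = 1.0 means undesirable_weight should be roughly 2.5 to 3.3,
# so that (1.0 * 10,000) : (w * 3,000) stays between 1:1 and 4:3.
training_args = KTOConfig(
    output_dir="kto-aligned-model",
    beta=0.1,                       # default; lower beta tolerates only lower learning rates
    learning_rate=1e-6,             # recommended ceiling for beta = 0.1
    per_device_train_batch_size=4,  # per-step batch of at least 4 keeps the KL estimate usable
    gradient_accumulation_steps=8,  # effective batch size of 32, inside the 16-128 range
    num_train_epochs=1,
    desirable_weight=1.0,
    undesirable_weight=1.0,
)

trainer = KTOTrainer(
    model,
    ref_model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)
trainer.train()
```

On a single GPU this gives an effective batch size of 4 x 8 = 32; if you train on several devices, scale `gradient_accumulation_steps` down accordingly.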
examples/scripts/kto.py (10 changes: 5 additions & 5 deletions)

```diff
@@ -20,7 +20,7 @@
     --model_name_or_path=trl-lib/qwen1.5-1.8b-sft \
     --per_device_train_batch_size 16 \
     --num_train_epochs 1 \
-    --learning_rate 5e-7 \
+    --learning_rate 1e-6 \
     --lr_scheduler_type=cosine \
     --gradient_accumulation_steps 1 \
     --logging_steps 10 \
@@ -36,7 +36,7 @@
     --model_name_or_path=trl-lib/qwen1.5-1.8b-sft \
     --per_device_train_batch_size 8 \
     --num_train_epochs 1 \
-    --learning_rate 5e-7 \
+    --learning_rate 1e-6 \
     --lr_scheduler_type=cosine \
     --gradient_accumulation_steps 1 \
     --logging_steps 10 \
@@ -98,16 +98,16 @@ class ScriptArguments:
     dataset = load_dataset(script_args.dataset_name)
 
     # If needed, reformat a DPO-formatted dataset (prompt, chosen, rejected) to a KTO-format (prompt, completion, label)
-    dataset = maybe_unpair_preference_dataset(dataset, num_proc=training_args.dataset_num_proc)
+    # dataset = maybe_unpair_preference_dataset(dataset, num_proc=training_args.dataset_num_proc)
 
     # Apply chat template
     def format_dataset(example):
         if isinstance(example["completion"], str):
             example["prompt"] = tokenizer.apply_chat_template(example["prompt"], tokenize=False)
             example["completion"] = tokenizer.apply_chat_template(example["completion"], tokenize=False)
         else:
-            example["prompt"] = tokenizer.apply_chat_template(example["completion"][:-1], tokenize=False)
-            example["completion"] = tokenizer.apply_chat_template([example["completion"][-1]], tokenize=False)
+            example["prompt"] = tokenizer.apply_chat_template(example["prompt"], tokenize=False)
+            example["completion"] = tokenizer.apply_chat_template(example["completion"], tokenize=False)
         return example
 
     # Compute that only on the main process for faster data processing.
```
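The script change above comments out the unconditional `maybe_unpair_preference_dataset` call in this 18-commit view (a later commit in the PR, c83cf4a, adds it back) and makes both chat-template branches apply the template to `prompt` and `completion` directly. As a hedged sketch of what that preprocessing does, the toy example below runs the same two steps on a single paired record; the record contents and the tokenizer choice are placeholders, not part of the PR.

```py
from datasets import Dataset
from transformers import AutoTokenizer
from trl import maybe_unpair_preference_dataset

tokenizer = AutoTokenizer.from_pretrained("trl-lib/qwen1.5-1.8b-sft")

# One toy DPO-format (paired) record: a prompt with a chosen and a rejected reply.
paired = Dataset.from_dict(
    {
        "prompt": [[{"role": "user", "content": "What color is the sky?"}]],
        "chosen": [[{"role": "assistant", "content": "The sky is blue."}]],
        "rejected": [[{"role": "assistant", "content": "Green, probably."}]],
    }
)

# Step 1: unpair into KTO format, i.e. (prompt, completion, label) rows.
unpaired = maybe_unpair_preference_dataset(paired)

# Step 2: render the conversations to text with the chat template,
# mirroring format_dataset in the script.
def to_text(example):
    example["prompt"] = tokenizer.apply_chat_template(example["prompt"], tokenize=False)
    example["completion"] = tokenizer.apply_chat_template(example["completion"], tokenize=False)
    return example

unpaired = unpaired.map(to_text)
print(unpaired[0]["label"], unpaired[0]["completion"][:80])
```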
trl/trainer/kto_config.py (3 changes: 2 additions & 1 deletion)

```diff
@@ -77,7 +77,7 @@ class KTOConfig(TrainingArguments):
         Number of processes to use for processing the dataset.
     """
 
-    learning_rate: float = 5e-7
+    learning_rate: float = 1e-6
    max_length: Optional[int] = None
    max_prompt_length: Optional[int] = None
    max_completion_length: Optional[int] = None
@@ -90,6 +90,7 @@ class KTOConfig(TrainingArguments):
    truncation_mode: str = "keep_end"
    generate_during_eval: bool = False
    is_encoder_decoder: Optional[bool] = None
+    disable_dropout: bool = True
    precompute_ref_log_probs: bool = False
    model_init_kwargs: Optional[Dict[str, Any]] = None
    ref_model_init_kwargs: Optional[Dict[str, Any]] = None
```
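The two config fields touched here are the default learning rate (now `1e-6`) and the new `disable_dropout` flag, which defaults to `True` and mirrors the equivalent option on the DPO side. A short sketch of overriding them, assuming the field names land as shown in this diff:

```py
from trl import KTOConfig

# Defaults after this PR: lr of 1e-6 and dropout disabled during training.
args = KTOConfig(output_dir="kto-out")
print(args.learning_rate, args.disable_dropout)  # 1e-06 True

# Keep dropout active and use a more conservative learning rate, e.g. when
# training on only-desirable or only-undesirable data as the docs suggest.
args = KTOConfig(
    output_dir="kto-out",
    learning_rate=5e-7,
    disable_dropout=False,
)
```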