huggingface · qgallouedec · Sep 16, 2024 · Sep 15, 2024 · Sep 16, 2024 · Sep 16, 2024
diff --git a/docs/source/kto_trainer.mdx b/docs/source/kto_trainer.mdx
@@ -64,11 +64,16 @@ The `beta` refers to the hyperparameter of the implicit reward, and the dataset
 The `desirable_weight` and `undesirable_weight` refer to the weights placed on the losses for desirable/positive and undesirable/negative examples.
 By default, they are both 1. However, if you have more of one or the other, then you should upweight the less common type such that the ratio of (`desirable_weight` \\(\times\\) number of positives) to (`undesirable_weight` \\(\times\\) number of negatives) is in the range 1:1 to 4:3.
 
+<Tip>
+It is strongly recommended that you use a learning rate between `5e-7` and `5e-6` with an effective batch size between `8` and `32`, for both LoRA and full finetuning. Even if you are working with a small dataset, we do not recommend using a learning rate outside this range; instead, using smaller batch sizes and/or more training epochs will give you better results.
+</Tip>
+
 ```py
 training_args = KTOConfig(
     beta=0.1,
     desirable_weight=1.0,
     undesirable_weight=1.0,
+    learning_rate=5e-7,
 )
 
 kto_trainer = KTOTrainer(

diff --git a/examples/scripts/kto.py b/examples/scripts/kto.py
@@ -20,7 +20,7 @@
     --model_name_or_path=trl-lib/qwen1.5-1.8b-sft \
     --per_device_train_batch_size 16 \
     --num_train_epochs 1 \
-    --learning_rate 1e-5 \
+    --learning_rate 5e-7 \
     --lr_scheduler_type=cosine \
     --gradient_accumulation_steps 1 \
     --logging_steps 10 \
@@ -36,7 +36,7 @@
     --model_name_or_path=trl-lib/qwen1.5-1.8b-sft \
     --per_device_train_batch_size 8 \
     --num_train_epochs 1 \
-    --learning_rate 1e-4 \
+    --learning_rate 5e-7 \
     --lr_scheduler_type=cosine \
     --gradient_accumulation_steps 1 \
     --logging_steps 10 \

diff --git a/trl/trainer/kto_config.py b/trl/trainer/kto_config.py
@@ -27,6 +27,8 @@ class KTOConfig(TrainingArguments):
     command line.
 
     Parameters:
+        learning_rate (`float`, *optional*, defaults to `5e-7`):
+            Initial learning rate for the [`AdamW`] optimizer. The default value replaces that of [`~transformers.TrainingArguments`].
         max_length (`Optional[int]`, *optional*, defaults to `None`):
             Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
             to use the default data collator.
@@ -74,6 +76,7 @@ class KTOConfig(TrainingArguments):
             Number of processes to use for processing the dataset.
     """
 
+    learning_rate: float = 5e-7
     max_length: Optional[int] = None
     max_prompt_length: Optional[int] = None
     max_completion_length: Optional[int] = None