huggingface · lewtun · Aug 28, 2024 · Jul 3, 2024 · Jul 5, 2024 · Jul 9, 2024
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
@@ -31,12 +31,12 @@
     title: PPOv2 Trainer
   - local: rloo_trainer
     title: RLOO Trainer
-  - local: online_dpo_trainer
-    title: Online DPO Trainer
   - local: best_of_n
     title: Best of N Sampling
   - local: dpo_trainer
     title: DPO Trainer
+  - local: online_dpo_trainer
+    title: Online DPO Trainer
   - local: kto_trainer
     title: KTO Trainer
   - local: bco_trainer

diff --git a/docs/source/online_dpo_trainer.md b/docs/source/online_dpo_trainer.md
diff --git a/examples/scripts/dpo_online.py b/examples/scripts/dpo_online.py
@@ -0,0 +1,108 @@
+# flake8: noqa
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+
+python examples/scripts/dpo_online.py \
+    --model_name_or_path trl-lib/pythia-1b-deduped-tldr-sft \
+    --reward_model_path trl-lib/pythia-1b-deduped-tldr-rm \
+    --dataset_name trl-lib/tldr \
+    --learning_rate 5.0e-7 \
+    --output_dir pythia-1b-tldr-online-dpo \
+    --per_device_train_batch_size 4 \
+    --gradient_accumulation_steps 32 \
+    --num_train_epochs 3 \
+    --completion_length 53 \
+    --warmup_ratio 0.1 \
+    --missing_eos_penalty 1.0 \
+    --push_to_hub
+"""
+
+import torch
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
+from accelerate import PartialState
+from trl import (
+    DPOScriptArguments,
+    ModelConfig,
+    OnlineDPOConfig,
+    OnlineDPOTrainer,
+    get_kbit_device_map,
+    get_quantization_config,
+)
+from trl.commands.cli_utils import TrlParser
+from trl.trainer.callbacks import LogCompletionsCallback
+from trl.trainer.utils import SIMPLE_QUERY_CHAT_TEMPLATE
+
+if __name__ == "__main__":
+    parser = TrlParser((DPOScriptArguments, OnlineDPOConfig, ModelConfig))
+    args, training_args, model_config = parser.parse_args_and_config()
+    # Checkpointing options live on the training config (cf. `use_cache` below), not on the script args.
+    training_args.gradient_checkpointing_kwargs = {"use_reentrant": True}
+
+    torch_dtype = (
+        model_config.torch_dtype
+        if model_config.torch_dtype in ["auto", None]
+        else getattr(torch, model_config.torch_dtype)
+    )
+    quantization_config = get_quantization_config(model_config)
+    model_kwargs = dict(
+        revision=model_config.model_revision,
+        attn_implementation=model_config.attn_implementation,
+        torch_dtype=torch_dtype,
+        use_cache=not training_args.gradient_checkpointing,
+        device_map=get_kbit_device_map() if quantization_config is not None else None,
+        quantization_config=quantization_config,
+    )
+
+    model = AutoModelForCausalLM.from_pretrained(
+        model_config.model_name_or_path, trust_remote_code=model_config.trust_remote_code, **model_kwargs
+    )
+    ref_model = AutoModelForCausalLM.from_pretrained(
+        model_config.model_name_or_path, trust_remote_code=model_config.trust_remote_code, **model_kwargs
+    )
+    reward_model = AutoModelForSequenceClassification.from_pretrained(
+        training_args.reward_model_path, num_labels=1, trust_remote_code=model_config.trust_remote_code
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_config.model_name_or_path, padding_side="left", trust_remote_code=model_config.trust_remote_code
+    )
+    # Fall back to the simple query template when the tokenizer does not define a chat template.
+    if tokenizer.chat_template is None:
+        tokenizer.chat_template = SIMPLE_QUERY_CHAT_TEMPLATE
+
+    dataset = load_dataset(args.dataset_name)
+
+    def prepare_dataset(row):
+        row["prompt"] = tokenizer.apply_chat_template(row["prompt"], tokenize=False, add_generation_prompt=True)
+        return row
+
+    with PartialState().local_main_process_first():
+        dataset = dataset.map(prepare_dataset, num_proc=training_args.dataset_num_proc)
+
+    # The first 8 test-split prompts are handed to the completion-logging callback below.
+    prompts = dataset[args.dataset_test_split]["prompt"][:8]
+
+    trainer = OnlineDPOTrainer(
+        model=model,
+        ref_model=ref_model,
+        reward_model=reward_model,
+        args=training_args,
+        train_dataset=dataset[args.dataset_train_split],
+        eval_dataset=dataset[args.dataset_test_split],
+        tokenizer=tokenizer,
+    )
+    trainer.add_callback(LogCompletionsCallback(prompts))
+    trainer.train()
diff --git a/examples/scripts/online_dpo.py b/examples/scripts/online_dpo.py
diff --git a/tests/test_online_dpo_trainer.py b/tests/test_online_dpo_trainer.py
@@ -28,11 +28,6 @@ def setUp(self):
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
         self.tokenizer.pad_token = self.tokenizer.eos_token
 
-    def _get_dummy_model_and_tokenizer(self):
-        # Return dummy model and tokenizer. This is a placeholder.
-        return self.model, self.tokenizer, self.reward_model
-
-    def _init_dummy_dataset(self):
         # fmt: off
         dummy_dataset_dict = {
             "prompt": [
@@ -70,31 +65,13 @@ def _init_dummy_dataset(self):
             ],
         }
         # fmt: on
-        return Dataset.from_dict(dummy_dataset_dict)
+        self.dummy_dataset = Dataset.from_dict(dummy_dataset_dict)
 
     @unittest.skip(
         "This test fails with the latest transformers version. We skip it as we are about "
         "to refactor the `OnlineDPOTrainer`. See PR #1839."
     )
     def test_online_dpo_trainer_training(self):
-        model, tokenizer, reward_model = self._get_dummy_model_and_tokenizer()
-        dummy_dataset = self._init_dummy_dataset()
-
-        def tokenize(element):
-            outputs = tokenizer(
-                element["prompt"],
-                padding=False,
-            )
-            return {"input_ids": outputs["input_ids"]}
-
-        dummy_dataset = dummy_dataset.map(
-            tokenize,
-            remove_columns=dummy_dataset.column_names,
-            batched=True,
-            num_proc=4,
-            load_from_cache_file=False,
-        )
-
         with tempfile.TemporaryDirectory() as tmp_dir:
             training_args = OnlineDPOConfig(
                 output_dir=tmp_dir,
@@ -108,16 +85,16 @@ def tokenize(element):
             )
 
             trainer = OnlineDPOTrainer(
-                model=model,
-                ref_model=model,
-                reward_model=reward_model,
-                config=training_args,
-                tokenizer=tokenizer,
-                train_dataset=dummy_dataset,
-                eval_dataset=dummy_dataset,
+                model=self.model,
+                ref_model=self.model,
+                reward_model=self.reward_model,
+                args=training_args,
+                tokenizer=self.tokenizer,
+                train_dataset=self.dummy_dataset,
+                eval_dataset=self.dummy_dataset,
             )
 
             trainer.train()
 
             # Check if training loss is available
-            self.assertIn("loss/policy_avg", trainer.state.log_history[-1])
+            self.assertIn("train_loss", trainer.state.log_history[-1])