Visual DPO #1647

Merged · 44 commits · Jun 26, 2024

Commits
8768fe6
Remove extra whitespaces
qgallouedec May 17, 2024
5d43f2b
idefics
qgallouedec May 17, 2024
f5a3237
vdpo
qgallouedec May 27, 2024
682c034
sft idefics
qgallouedec May 27, 2024
bf01bf3
pad with test
qgallouedec May 30, 2024
aed1aeb
use prompt instead of tokenizer
qgallouedec May 30, 2024
e814f88
rm name main
qgallouedec May 30, 2024
fd5d71b
support vlm in tokenize row
qgallouedec May 30, 2024
e1b8755
temp fix for regex in lora_target_module
qgallouedec May 30, 2024
8075419
format
qgallouedec May 31, 2024
1b815c2
vdpo
qgallouedec May 31, 2024
6d6a194
tmp float16 hard code
qgallouedec Jun 3, 2024
1935d3d
concatenated_forward support for vision
qgallouedec Jun 3, 2024
bdc2b95
style and new command line
qgallouedec Jun 17, 2024
24b08f5
all-linear
qgallouedec Jun 17, 2024
c5ff8d7
format
qgallouedec Jun 18, 2024
a7d1732
delete old examples
qgallouedec Jun 18, 2024
2303c40
get image
qgallouedec Jun 18, 2024
b606190
upcast
qgallouedec Jun 18, 2024
4f78ee5
new test
qgallouedec Jun 18, 2024
c4433c0
modified test
qgallouedec Jun 18, 2024
7a8a94f
new strat for tokenizer
qgallouedec Jun 18, 2024
a9a4607
Merge branch 'main' into fix-vsft-example
qgallouedec Jun 25, 2024
9955710
rm token transfer
qgallouedec Jun 25, 2024
f6ee370
integrate vision in dpo example
qgallouedec Jun 25, 2024
56fb036
format
qgallouedec Jun 25, 2024
c3249e5
add FDivergenceType back
qgallouedec Jun 25, 2024
f69bb1c
precommit
qgallouedec Jun 25, 2024
6d859cf
pillow test dep
qgallouedec Jun 25, 2024
48db3e1
optional prompt
qgallouedec Jun 25, 2024
dea765b
`evaluation_strategy` to `eval_strategy`
qgallouedec Jun 25, 2024
d6dc3ba
revert vsft change (oos)
qgallouedec Jun 25, 2024
3a1f5b8
update test
qgallouedec Jun 25, 2024
5545825
test
qgallouedec Jun 25, 2024
5197d6d
comment and support more in process
qgallouedec Jun 26, 2024
45fda7e
update process
qgallouedec Jun 26, 2024
5a1dfa7
update doc for vdpo
qgallouedec Jun 26, 2024
2c10ca8
caution about limited support
qgallouedec Jun 26, 2024
2e47633
Update docs/source/dpo_trainer.mdx
qgallouedec Jun 26, 2024
f960a2a
revert DPO example changes
qgallouedec Jun 26, 2024
e4c7436
cleaner way to check if a model is vision
qgallouedec Jun 26, 2024
bfb35d3
comment
qgallouedec Jun 26, 2024
7b22153
update vdpo example
qgallouedec Jun 26, 2024
5155194
rename
qgallouedec Jun 26, 2024
29 changes: 27 additions & 2 deletions docs/source/dpo_trainer.mdx
@@ -70,8 +70,33 @@ dpo_dataset_dict = {

where the `prompt` contains the context inputs, `chosen` contains the corresponding chosen responses and `rejected` contains the corresponding negative (rejected) responses. As can be seen, a prompt can have multiple responses, and this is reflected in the entries being repeated in the dictionary's value arrays.
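As an illustration (the strings below are placeholders), a text-only preference dictionary with a repeated prompt could look like:

```py
dpo_dataset_dict = {
    'prompt': [
        'hello',
        'hello',
        'how are you',
    ],
    'chosen': [
        'hi, nice to meet you',
        'hi you',
        'I am fine',
    ],
    'rejected': [
        'leave me alone',
        'go away',
        'I am not fine',
    ],
}
```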

`DPOTrainer` can be used to fine-tune visual language models (VLMs). In this case, the dataset must also contain the key `images`. For example, for Idefics2, the processor expects the dataset to have the following format:

Note: Currently, VLM support is exclusive to Idefics2 and does not extend to other VLMs.

```py
from PIL import Image

dpo_dataset_dict = {
'images': [
[Image.open('beach.jpg')],
[Image.open('street.jpg')],
],
'prompt': [
'The image <image> shows',
'<image> The image depicts',
],
'chosen': [
'a sunny beach with palm trees.',
'a busy street with several cars and buildings.',
],
'rejected': [
'a snowy mountain with skiers.',
'a calm countryside with green fields.',
],
}
```
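Large images make preprocessing and collation noticeably slower, so the example scripts in this PR shrink every image so that its longest side is at most 640 pixels before mapping the dataset. A minimal sketch of that resizing step (the 640-pixel cap simply mirrors the value used in the scripts):

```py
from PIL import Image

def resize_to_max_side(img: Image.Image, max_side: int = 640) -> Image.Image:
    # Downscale so the longest side is at most `max_side`, preserving the aspect ratio.
    ratio = min(1.0, max_side / max(img.size))
    return img.resize((int(img.size[0] * ratio), int(img.size[1] * ratio)))
```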

## Expected model format
The DPO trainer expects a model of `AutoModelForCausalLM`, compared to PPO that expects `AutoModelForCausalLMWithValueHead` for the value function.
The DPO trainer expects a model of `AutoModelForCausalLM` or `AutoModelForVision2Seq`, compared to PPO that expects `AutoModelForCausalLMWithValueHead` for the value function.
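For a VLM such as Idefics2, this means loading the policy (and, when not using PEFT, the reference model) with `AutoModelForVision2Seq` and pairing it with the matching `AutoProcessor`. A minimal sketch, following the example script in this PR:

```py
from transformers import AutoModelForVision2Seq, AutoProcessor

model_id = "HuggingFaceM4/idefics2-8b"
model = AutoModelForVision2Seq.from_pretrained(model_id)
model_ref = AutoModelForVision2Seq.from_pretrained(model_id)  # can be None when training with PEFT
processor = AutoProcessor.from_pretrained(model_id)
tokenizer = processor.tokenizer
```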

## Using the `DPOTrainer`

@@ -86,7 +111,7 @@ dpo_trainer = DPOTrainer(
model_ref,
args=training_args,
train_dataset=train_dataset,
tokenizer=tokenizer,
tokenizer=tokenizer, # for visual language models, use tokenizer=processor instead
)
```
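For a VLM, the same construction takes the processor in place of the tokenizer (a sketch reusing the `model`, `model_ref` and `processor` loaded above):

```py
dpo_trainer = DPOTrainer(
    model,
    model_ref,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=processor,  # the processor bundles the tokenizer and the image processor
)
```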
After this, one can then call `dpo_trainer.train()`.
76 changes: 63 additions & 13 deletions examples/scripts/dpo.py
@@ -48,6 +48,19 @@
--use_peft \
--lora_r=16 \
--lora_alpha=16

# vision with peft:
accelerate launch examples/scripts/dpo.py \
--dataset_name HuggingFaceH4/rlaif-v_formatted \
--model_name_or_path HuggingFaceM4/idefics2-8b \
--output_dir dpo_idefics_rlaif-v \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 16 \
--learning_rate 1e-5 \
--bf16 \
--torch_dtype bfloat16 \
--use_peft \
--lora_target_modules=all-linear
"""

import logging
@@ -58,6 +71,7 @@
TRL_USE_RICH = os.environ.get("TRL_USE_RICH", False)

from trl.commands.cli_utils import DPOScriptArguments, init_zero_verbose, TrlParser
from accelerate import PartialState

if TRL_USE_RICH:
init_zero_verbose()
@@ -68,7 +82,7 @@

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForVision2Seq, AutoProcessor

from trl import (
DPOConfig,
@@ -112,13 +126,25 @@
device_map=get_kbit_device_map() if quantization_config is not None else None,
quantization_config=quantization_config,
)
model = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
is_vision_model = model_config.model_name_or_path in ["HuggingFaceM4/idefics2-8b"]
if is_vision_model:
model = AutoModelForVision2Seq.from_pretrained(model_config.model_name_or_path, **model_kwargs)
else:
model = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
peft_config = get_peft_config(model_config)
if peft_config is None:
model_ref = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
if is_vision_model:
model_ref = AutoModelForVision2Seq.from_pretrained(model_config.model_name_or_path, **model_kwargs)
else:
model_ref = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
else:
model_ref = None
tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path)
if is_vision_model:
processor = AutoProcessor.from_pretrained(model_config.model_name_or_path, do_image_splitting=True)
tokenizer = processor.tokenizer
else:
tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path)
processor = AutoProcessor.from_pretrained(model_config.model_name_or_path, do_image_splitting=False)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
if tokenizer.chat_template is None:
@@ -148,16 +174,40 @@
ds[key] = ds[key].select(range(50))

def process(row):
row["prompt"] = tokenizer.apply_chat_template(row["chosen"][:-1], tokenize=False)
row["chosen"] = tokenizer.apply_chat_template([row["chosen"][-1]], tokenize=False)
row["rejected"] = tokenizer.apply_chat_template([row["rejected"][-1]], tokenize=False)
# The prompt can be either a string or a list. In some datasets, the prompt is just a common string
# for both rejected and chosen (already included in chosen and rejected) and is not meant to be used
# separately. In other datasets, the prompt is intended to be used as a prefix for rejected and chosen,
# and in such cases, it is properly formatted as a list with keys "role" and "content".
# Example 1:
# row = {"prompt": "What does detox mean?",
# "chosen": [{"content": "What does detox mean?", "role": "user"}, {"content": "It means to get rid of the toxins.", "role": "assistant"}],
# "rejected": [{"content": "What does detox mean?", "role": "assistant"}, {"content": "I don't know.", "role": "assistant"}]}
# Example 2:
# row = {"prompt": [{"content": "What does detox mean?", "role": "user"}],
# "chosen": [{"content": "It means to get rid of the toxins.", "role": "assistant"}],
# "rejected": [{"content": "I don't know.", "role": "assistant"}]}
if is_vision_model:
apply_chat_template = processor.apply_chat_template
else:
apply_chat_template = tokenizer.apply_chat_template

if "prompt" in row and isinstance(row["prompt"], list):
row["prompt"] = apply_chat_template(row["prompt"], tokenize=False)

row["chosen"] = apply_chat_template(row["chosen"], tokenize=False)
row["rejected"] = apply_chat_template(row["rejected"], tokenize=False)

if "images" in row:
for idx, img in enumerate(row["images"]): # Resize each image so the largest side is 640 pixels
ratio = min(1.0, 640 / max(img.size))
new_size = (int(img.size[0] * ratio), int(img.size[1] * ratio))
row["images"][idx] = img.resize(new_size)
row["images"] = row["images"]

return row

ds = ds.map(
process,
num_proc=multiprocessing.cpu_count(),
load_from_cache_file=False,
)
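# Run the map on the local main process first so that the other processes reuse the cached result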
with PartialState().local_main_process_first():
ds = ds.map(process, num_proc=multiprocessing.cpu_count())
train_dataset = ds[args.dataset_train_split]
eval_dataset = ds[args.dataset_test_split]

Expand All @@ -171,7 +221,7 @@ def process(row):
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
tokenizer=processor if is_vision_model else tokenizer,
peft_config=get_peft_config(model_config),
callbacks=[RichProgressCallback] if TRL_USE_RICH else None,
)
186 changes: 186 additions & 0 deletions examples/scripts/vdpo.py
@@ -0,0 +1,186 @@
# flake8: noqa
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
accelerate launch examples/scripts/vdpo.py \
--dataset_name HuggingFaceH4/rlaif-v_formatted \
--model_name_or_path HuggingFaceM4/idefics2-8b \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 16 \
--learning_rate 1e-5 \
--logging_steps 5 \
--output_dir dpo_idefics_rlaif-v \
--push_to_hub --hub_model_id HuggingFaceH4/idefics2-8b-dpo-rlaif-v \
--bf16 \
--torch_dtype bfloat16 \
--logging_first_step \
--no_remove_unused_columns \
--dataset_num_proc 50 \
--dataloader_num_workers 16 \
--use_peft \
--lora_target_modules=all-linear
"""

import logging
import multiprocessing
import os
from contextlib import nullcontext

TRL_USE_RICH = os.environ.get("TRL_USE_RICH", False)

from trl.commands.cli_utils import DPOScriptArguments, init_zero_verbose, TrlParser
from accelerate import PartialState

if TRL_USE_RICH:
init_zero_verbose()
FORMAT = "%(message)s"

from rich.console import Console
from rich.logging import RichHandler

import torch
from datasets import load_dataset
from transformers import AutoModelForVision2Seq, AutoProcessor

from trl import (
DPOConfig,
DPOTrainer,
ModelConfig,
RichProgressCallback,
get_kbit_device_map,
get_peft_config,
get_quantization_config,
)


if TRL_USE_RICH:
logging.basicConfig(format=FORMAT, datefmt="[%X]", handlers=[RichHandler()], level=logging.INFO)


if __name__ == "__main__":
parser = TrlParser((DPOScriptArguments, DPOConfig, ModelConfig))
args, training_args, model_config = parser.parse_args_and_config()

# Force use our print callback
if TRL_USE_RICH:
training_args.disable_tqdm = True
console = Console()

################
# Model & Tokenizer
################
torch_dtype = (
model_config.torch_dtype
if model_config.torch_dtype in ["auto", None]
else getattr(torch, model_config.torch_dtype)
)
quantization_config = get_quantization_config(model_config)

model_kwargs = dict(
revision=model_config.model_revision,
trust_remote_code=model_config.trust_remote_code,
attn_implementation=model_config.attn_implementation,
torch_dtype=torch_dtype,
use_cache=False if training_args.gradient_checkpointing else True,
device_map=get_kbit_device_map() if quantization_config is not None else None,
quantization_config=quantization_config,
)
model = AutoModelForVision2Seq.from_pretrained(model_config.model_name_or_path, **model_kwargs)
peft_config = get_peft_config(model_config)
if peft_config is None:
model_ref = AutoModelForVision2Seq.from_pretrained(model_config.model_name_or_path, **model_kwargs)
else:
model_ref = None
processor = AutoProcessor.from_pretrained(model_config.model_name_or_path, do_image_splitting=False)
tokenizer = processor.tokenizer
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
if tokenizer.chat_template is None:
tokenizer.chat_template = "{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\n\n'}}{% endfor %}{{ eos_token }}"
if args.ignore_bias_buffers:
# torch distributed hack
model._ddp_params_and_buffers_to_ignore = [
name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
]

################
# Optional rich context managers
###############
init_context = nullcontext() if not TRL_USE_RICH else console.status("[bold green]Initializing the DPOTrainer...")
save_context = (
nullcontext()
if not TRL_USE_RICH
else console.status(f"[bold green]Training completed! Saving the model to {training_args.output_dir}")
)

################
# Dataset
################
ds = load_dataset(args.dataset_name)
if args.sanity_check:
for key in ds:
ds[key] = ds[key].select(range(50))

def process(row):
# The prompt can be either a string or a list. In some datasets, the prompt is just a common string
# for both rejected and chosen (already included in chosen and rejected) and is not meant to be used
# separately. In other datasets, the prompt is intended to be used as a prefix for rejected and chosen,
# and in such cases, it is properly formatted as a list with keys "role" and "content".
# Example 1:
# row = {"prompt": "What does detox mean?",
# "chosen": [{"content": "What does detox mean?", "role": "user"}, {"content": "It means to get rid of the toxins.", "role": "assistant"}],
# "rejected": [{"content": "What does detox mean?", "role": "assistant"}, {"content": "I don't know.", "role": "user"}]}
# Example 2:
# row = {"prompt": [{"content": "What does detox mean?", "role": "user"}],
# "chosen": [{"content": "It means to get rid of the toxins.", "role": "assistant"}],
# "rejected": [{"content": "I don't know.", "role": "user"}]}
if "prompt" in row and isinstance(row["prompt"], list):
row["prompt"] = processor.apply_chat_template(row["prompt"], tokenize=False)

row["chosen"] = processor.apply_chat_template(row["chosen"], tokenize=False)
row["rejected"] = processor.apply_chat_template(row["rejected"], tokenize=False)

if "images" in row:
for idx, img in enumerate(row["images"]): # Resize each image so the largest side is 640 pixels
ratio = min(1.0, 640 / max(img.size))
new_size = (int(img.size[0] * ratio), int(img.size[1] * ratio))
row["images"][idx] = img.resize(new_size)
row["images"] = row["images"]

return row

with PartialState().local_main_process_first():
ds = ds.map(process, num_proc=multiprocessing.cpu_count())
train_dataset = ds[args.dataset_train_split]
eval_dataset = ds[args.dataset_test_split]

################
# Training
################
with init_context:
trainer = DPOTrainer(
model,
model_ref,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=processor,
peft_config=get_peft_config(model_config),
callbacks=[RichProgressCallback] if TRL_USE_RICH else None,
)

trainer.train()

with save_context:
    trainer.save_model(training_args.output_dir)
    if training_args.push_to_hub:
        trainer.push_to_hub()
11 changes: 10 additions & 1 deletion setup.py
@@ -69,7 +69,16 @@
"tyro>=0.5.11",
]
EXTRAS = {
"test": ["parameterized", "pytest", "pytest-xdist", "accelerate", "pytest-cov", "pytest-xdist", "scikit-learn"],
"test": [
"parameterized",
"pytest",
"pytest-xdist",
"accelerate",
"pytest-cov",
"pytest-xdist",
"scikit-learn",
"Pillow",
],
"peft": ["peft>=0.4.0"],
"diffusers": ["diffusers>=0.18.0"],
"deepspeed": ["deepspeed>=0.9.5"],