volcengine · eric-haibin-lin · Apr 4, 2025 · Feb 3, 2025 · Feb 3, 2025 · Feb 3, 2025
diff --git a/examples/data_preprocess/multiturn.py b/examples/data_preprocess/multiturn.py
@@ -0,0 +1,131 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Create a simple multi-turn dataset for testing
+"""
+
+import os
+import pandas as pd
+import argparse
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--local_dir', default='~/data/multiturn')
+    parser.add_argument('--hdfs_dir', default=None)
+    args = parser.parse_args()
+
+    # Create example conversations
+    conversations = []
+
+    # Conversation 1
+    conversations.append({
+        "messages": [{
+            "role": "system",
+            "content": "You are a helpful assistant."
+        }, {
+            "role": "user",
+            "content": "What is the capital of France?"
+        }, {
+            "role": "assistant",
+            "content": "The capital of France is Paris."
+        }, {
+            "role": "user",
+            "content": "And what about Germany?"
+        }, {
+            "role": "assistant",
+            "content": "The capital of Germany is Berlin."
+        }]
+    })
+
+    # Conversation 2
+    conversations.append({
+        "messages": [{
+            "role": "system",
+            "content": "You are a helpful assistant."
+        }, {
+            "role": "user",
+            "content": "Can you explain quantum computing?"
+        }, {
+            "role":
+                "assistant",
+            "content":
+                "Quantum computing is a type of computing that uses quantum-mechanical phenomena, such as superposition and entanglement, to perform operations on data."
+        }, {
+            "role": "user",
+            "content": "How is it different from classical computing?"
+        }, {
+            "role":
+                "assistant",
+            "content":
+                "Classical computing uses bits that are either 0 or 1, while quantum computing uses quantum bits or qubits that can exist in multiple states simultaneously due to superposition."
+        }]
+    })
+
+    # Conversation 3
+    conversations.append({
+        "messages": [{
+            "role": "system",
+            "content": "You are a helpful assistant."
+        }, {
+            "role": "user",
+            "content": "Write a simple Python function to calculate factorial."
+        }, {
+            "role":
+                "assistant",
+            "content":
+                "```python\ndef factorial(n):\n    if n == 0 or n == 1:\n        return 1\n    else:\n        return n * factorial(n-1)\n```\n\nThis is a recursive function to calculate the factorial of a number."
+        }, {
+            "role": "user",
+            "content": "Can you make it iterative instead?"
+        }, {
+            "role":
+                "assistant",
+            "content":
+                "```python\ndef factorial(n):\n    result = 1\n    for i in range(1, n+1):\n        result *= i\n    return result\n```\n\nThis is an iterative version of the factorial function."
+        }]
+    })
+
+    # Create train and test datasets
+    train_data = conversations[:2]  # First 2 conversations for training
+    test_data = conversations[2:]  # Last conversation for testing
+
+    # Create output directory
+    local_dir = os.path.expanduser(args.local_dir)
+    os.makedirs(local_dir, exist_ok=True)
+
+    # Save to parquet files
+    train_df = pd.DataFrame(train_data)
+    test_df = pd.DataFrame(test_data)
+
+    train_df.to_parquet(os.path.join(local_dir, 'train.parquet'))
+    test_df.to_parquet(os.path.join(local_dir, 'test.parquet'))
+
+    # Handle HDFS if specified
+    if args.hdfs_dir is not None:
+        try:
+            from verl.utils.hdfs_io import copy, makedirs
+            makedirs(args.hdfs_dir)
+            copy(src=local_dir, dst=args.hdfs_dir)
+        except ImportError:
+            print("Warning: HDFS support not available. Skipping HDFS copy.")
+
+    # Print statistics
+    print(f"Train dataset size: {len(train_df)}")
+    print(f"Test dataset size: {len(test_df)}")
+    print(f"Data saved to {local_dir}")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/sft/multiturn/run_qwen_05_sp2.sh b/examples/sft/multiturn/run_qwen_05_sp2.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+set -x
+
+if [ "$#" -lt 2 ]; then
+    echo "Usage: run_qwen_05_sp2.sh <nproc_per_node> <save_path> [other_configs...]"
+    exit 1
+fi
+
+nproc_per_node=$1
+save_path=$2
+
+# Shift the arguments so $@ refers to the rest
+shift 2
+
+torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
+     -m verl.trainer.fsdp_sft_trainer \
+    data.train_files=$HOME/data/multiturn/train.parquet \
+    data.val_files=$HOME/data/multiturn/test.parquet \
+    data.multiturn.enable=true \
+    data.multiturn.messages_key=messages \
+    data.micro_batch_size=4 \
+    model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
+    trainer.default_local_dir=$save_path \
+    trainer.project_name=multiturn-sft \
+    trainer.experiment_name=multiturn-sft-qwen-2.5-0.5b-instruct-sp2 \
+    trainer.logger=['console'] \
+    trainer.total_training_steps=1 \
+    trainer.default_hdfs_dir=null $@ \
+    ulysses_sequence_parallel_size=2 \
+    use_remove_padding=true
diff --git a/tests/sft/run_sft_multiturn.sh b/tests/sft/run_sft_multiturn.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+set -x
+
+if [ "$#" -lt 2 ]; then
+    echo "Usage: run_qwen_05_sp2.sh <nproc_per_node> <save_path> [other_configs...]"
+    exit 1
+fi
+
+nproc_per_node=$1
+save_path=$2
+
+# Shift the arguments so $@ refers to the rest
+shift 2
+
+torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
+     -m verl.trainer.fsdp_sft_trainer \
+    data.train_files=$HOME/data/multiturn/train.parquet \
+    data.val_files=$HOME/data/multiturn/test.parquet \
+    data.multiturn.enable=true \
+    data.multiturn.messages_key=messages \
+    data.micro_batch_size=4 \
+    model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
+    trainer.default_local_dir=$save_path \
+    trainer.project_name=multiturn-sft \
+    trainer.experiment_name=multiturn-sft-qwen-2.5-0.5b-instruct-sp2 \
+    trainer.logger=['console'] \
+    trainer.total_training_steps=1 \
+    trainer.default_hdfs_dir=null $@ \
+    ulysses_sequence_parallel_size=2 \
+    use_remove_padding=true
diff --git a/tests/verl/utils/dataset/test_multiturn_sft_dataset.py b/tests/verl/utils/dataset/test_multiturn_sft_dataset.py
@@ -0,0 +1,193 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Test the MultiTurnSFTDataset implementation
+"""
+import os
+import pandas as pd
+import torch
+from transformers import AutoTokenizer
+from verl.utils.dataset.multiturn_sft_dataset import MultiTurnSFTDataset
+
+
+def test_multiturn_sft_dataset():
+    print("Starting test...")
+    # Create a temporary parquet file with test data
+    test_data = {
+        'messages': [[{
+            "role": "system",
+            "content": "You are a helpful assistant."
+        }, {
+            "role": "user",
+            "content": "What is 2+2?"
+        }, {
+            "role": "assistant",
+            "content": "2+2 equals 4."
+        }, {
+            "role": "user",
+            "content": "And what is 4+4?"
+        }, {
+            "role": "assistant",
+            "content": "4+4 equals 8."
+        }],
+                     [{
+                         "role": "system",
+                         "content": "You are a helpful assistant."
+                     }, {
+                         "role": "user",
+                         "content": "Tell me a joke."
+                     }, {
+                         "role": "assistant",
+                         "content": "Why did the chicken cross the road?"
+                     }, {
+                         "role": "user",
+                         "content": "Why?"
+                     }, {
+                         "role": "assistant",
+                         "content": "To get to the other side!"
+                     }]]
+    }
+
+    # Create test directory if it doesn't exist
+    os.makedirs('test_data', exist_ok=True)
+    test_file = 'test_data/test.parquet'
+
+    # Save test data to parquet
+    df = pd.DataFrame(test_data)
+    df.to_parquet(test_file)
+
+    # Initialize tokenizer and dataset
+    tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-Coder-7B-Instruct')
+    config = {'max_length': 512, 'truncation': 'error', 'multiturn': {'messages_key': 'messages'}}
+    dataset = MultiTurnSFTDataset(parquet_files=test_file, tokenizer=tokenizer, config=config)
+
+    # Test 1: Dataset Length
+    assert len(dataset) == 2, f"Expected dataset length 2, got {len(dataset)}"
+
+    # Get items for testing
+    item0 = dataset[0]  # Math conversation
+    item1 = dataset[1]  # Joke conversation
+
+    # Test 2: Required Keys and Types
+    required_keys = ['input_ids', 'attention_mask', 'position_ids', 'loss_mask']
+    for key in required_keys:
+        assert key in item0, f"Missing key {key} in dataset item"
+        assert isinstance(item0[key], torch.Tensor), f"Expected torch.Tensor for {key}"
+        assert item0[key].dtype == torch.long, f"Expected torch.long for {key}, got {item0[key].dtype}"
+
+    # Test 3: Shape Consistency
+    assert item0['loss_mask'].shape == item0['input_ids'].shape, \
+        "Loss mask shape doesn't match input_ids shape"
+    assert item0['attention_mask'].shape == item0['input_ids'].shape, \
+        "Attention mask shape doesn't match input_ids shape"
+    assert item0['position_ids'].shape == item0['input_ids'].shape, \
+        "Position IDs shape doesn't match input_ids shape"
+
+    # Test 4: Loss Mask Pattern - Math Conversation
+    loss_mask0 = item0['loss_mask']
+    input_ids0 = item0['input_ids']
+
+    # Find assistant response positions
+    assistant_positions0 = torch.where(loss_mask0 == 1)[0]
+    assert len(assistant_positions0) > 0, "No assistant positions found in loss mask"
+
+    # Decode and verify assistant responses
+    assistant_text0 = tokenizer.decode(input_ids0[loss_mask0 == 1])
+    print(f"Math conversation assistant text: {assistant_text0}")
+    assert "2+2 equals 4" in assistant_text0, "First assistant response not found"
+    assert "4+4 equals 8" in assistant_text0, "Second assistant response not found"
+
+    # Test 5: Loss Mask Pattern - Joke Conversation
+    loss_mask1 = item1['loss_mask']
+    input_ids1 = item1['input_ids']
+
+    # Find assistant response positions
+    assistant_positions1 = torch.where(loss_mask1 == 1)[0]
+    assert len(assistant_positions1) > 0, "No assistant positions found in loss mask"
+
+    # Decode and verify assistant responses
+    assistant_text1 = tokenizer.decode(input_ids1[loss_mask1 == 1])
+    print(f"Joke conversation assistant text: {assistant_text1}")
+    assert "chicken cross the road" in assistant_text1, "First assistant response not found"
+    assert "other side" in assistant_text1, "Second assistant response not found"
+
+    # Test 6: Attention Mask Pattern
+    attention_mask0 = item0['attention_mask']
+    sequence_length = torch.sum(attention_mask0)
+    assert sequence_length > 0, "No tokens marked as attended in attention mask"
+    assert torch.all(attention_mask0[:sequence_length] == 1), "Incorrect attention mask pattern"
+    if sequence_length < len(attention_mask0):
+        assert torch.all(attention_mask0[sequence_length:] == 0), "Padding not properly masked"
+
+    # Test 7: Position IDs Pattern
+    position_ids0 = item0['position_ids']
+    assert torch.equal(position_ids0[:sequence_length], torch.arange(sequence_length)), \
+        "Position IDs not sequential for non-padded tokens"
+    if sequence_length < len(position_ids0):
+        assert torch.all(position_ids0[sequence_length:] == 0), "Padding position IDs not zero"
+
+    # Test 8: Verify loss mask for assistant responses
+    # Get the full conversation text
+    full_text = tokenizer.decode(input_ids0)
+    print(f"\nFull conversation text:\n{full_text}")
+
+    # Get the assistant responses
+    assistant_text = tokenizer.decode(input_ids0[loss_mask0 == 1])
+    print(f"\nAssistant responses (from loss mask):\n{assistant_text}")
+
+    # Verify that loss mask is set for all assistant responses
+    for msg in test_data['messages'][0]:  # First conversation
+        if msg['role'] == 'assistant':
+            # The content should appear in the masked text
+            assert msg['content'] in assistant_text, \
+                f"Assistant message '{msg['content']}' not found in masked text"
+
+            # The content should NOT appear in the non-masked text
+            non_assistant_text = tokenizer.decode(input_ids0[loss_mask0 == 0])
+            assert msg['content'] not in non_assistant_text, \
+                f"Assistant message '{msg['content']}' found in non-assistant text"
+
+    # Test 9: Verify non-assistant parts have loss_mask=0
+    # Get non-assistant text
+    non_assistant_text = tokenizer.decode(input_ids0[loss_mask0 == 0])
+    print(f"\nNon-assistant text (from loss mask):\n{non_assistant_text}")
+
+    # Verify that system and user messages are in the non-assistant text
+    for msg in test_data['messages'][0]:  # First conversation
+        if msg['role'] in ['system', 'user']:
+            assert msg['content'] in non_assistant_text, \
+                f"{msg['role'].title()} message '{msg['content']}' not found in non-assistant text"
+
+            # And verify they're NOT in the assistant text
+            assert msg['content'] not in assistant_text, \
+                f"{msg['role'].title()} message '{msg['content']}' found in assistant text"
+
+    # Test 10: Verify padding behavior
+    padding_config = {'max_length': 1024, 'truncation': 'error', 'multiturn': {'messages_key': 'messages'}}
+    small_dataset = MultiTurnSFTDataset(parquet_files=test_file, tokenizer=tokenizer, config=padding_config)
+    padded_item = small_dataset[0]
+
+    # Get actual sequence length (before padding)
+    actual_length = torch.sum(padded_item['attention_mask'])
+
+    # Verify padding tokens
+    assert torch.all(padded_item['input_ids'][actual_length:] == tokenizer.pad_token_id), \
+        "Padding tokens not set correctly"
+    assert torch.all(padded_item['attention_mask'][actual_length:] == 0), \
+        "Attention mask not set correctly for padding"
+    assert torch.all(padded_item['loss_mask'][actual_length:] == 0), \
+        "Loss mask not set correctly for padding"
+
+    print("All tests passed!")
+    print("Starting test...")