
Commit 4fa7ed6

[mcore] qwen2moe support (#1139)
Support the Qwen2MoE architecture with Megatron-Core, including:
* qwen2moe config converter
* qwen2moe model initializer
* a refactored online weight converter from mcore to vLLM
* qwen2moe online weight converter
* qwen2moe offline weight conversion script from HF to mcore
* a script to train Qwen1.5-MoE-A2.7B with 4 nodes

TODO: add an option to freeze the MoE router weights during training.
1 parent c54ec18 commit 4fa7ed6

File tree

13 files changed: +564 −108 lines
Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
set -x

# 0. download the model
huggingface-cli download Qwen/Qwen1.5-MoE-A2.7B-Chat

# 1. convert the model to mcore format
# change HF_MODEL_PATH and DIST_CKPT_PATH to your own paths
HF_MODEL_PATH=/data/models/Qwen/Qwen1.5-MoE-A2.7B-Chat
DIST_CKPT_PATH=/data/mcore_ckpt/Qwen1.5-MoE-A2.7B-Chat
python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH

# 2. run the training script
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
train_files=$gsm8k_train_path
test_files=$gsm8k_test_path

NODES=4
PP=2
TP=4
CP=1
VLLM_TP=4

# RAY_ADDRESS='auto' ray job submit --working-dir . --
python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer' \
    algorithm.adv_estimator=gae \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=1024 \
    data.max_prompt_length=1024 \
    data.max_response_length=512 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=$HF_MODEL_PATH \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
    critic.optim.lr=1e-5 \
    critic.model.path=$HF_MODEL_PATH \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger=['console','wandb'] \
    trainer.project_name='verl_megatron_gsm8k_examples' \
    trainer.experiment_name='qwen1.5_moe_nochat' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=$NODES \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=$VLLM_TP \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \
    critic.megatron.pipeline_model_parallel_size=$PP \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \
    critic.megatron.tensor_model_parallel_size=$TP \
    actor_rollout_ref.actor.megatron.context_parallel_size=$CP \
    actor_rollout_ref.ref.megatron.context_parallel_size=$CP \
    critic.megatron.context_parallel_size=$CP \
    actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \
    actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \
    critic.megatron.use_dist_checkpointing=True \
    actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
    actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
    critic.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
    trainer.total_epochs=100 $@
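As a quick cross-check of the sizes used above: 4 nodes × 8 GPUs give 32 GPUs in total, TP=4 × PP=2 × CP=1 occupies 8 of them per Megatron model replica (so the data-parallel size is 4), and the vLLM rollout workers form their own TP=4 groups. A minimal sketch of that arithmetic, purely illustrative and not part of the commit:

```python
# Sanity-check the parallel layout chosen in the script above (illustrative only).
nodes, gpus_per_node = 4, 8
tp, pp, cp = 4, 2, 1      # Megatron tensor / pipeline / context parallel sizes
vllm_tp = 4               # rollout (vLLM) tensor parallel size

world_size = nodes * gpus_per_node            # 32 GPUs
per_replica = tp * pp * cp                    # 8 GPUs per Megatron model replica
assert world_size % per_replica == 0
dp = world_size // per_replica                # 4 data-parallel replicas
assert world_size % vllm_tp == 0              # vLLM TP groups must also tile the cluster
print(f"world={world_size}, gpus/replica={per_replica}, dp={dp}")
```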

scripts/converter_hf_to_mcore.py

Lines changed: 73 additions & 23 deletions
@@ -22,9 +22,11 @@
 from megatron.core import parallel_state as mpu
 from megatron.core.dist_checkpointing.serialization import StrictHandling
 from megatron.core.models.gpt.gpt_model import ModelType
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
 from transformers import AutoConfig, AutoModelForCausalLM
 
-from verl.utils.megatron_utils import convert_config, get_model
+from verl.models.mcore import hf_to_mcore_config
+from verl.utils.megatron_utils import get_model
 
 
 def _init_args():
@@ -51,6 +53,49 @@ def __init__(self):
         self.model = ModelConfig()
 
 
+def convert_checkpoint_from_transformers_to_megatron(hf_model, model, hf_config):
+    num_attention_heads = hf_config.num_attention_heads
+    hidden_dim = hf_config.hidden_size
+    head_dim = hidden_dim // num_attention_heads
+    with torch.no_grad():
+        model.embedding.word_embeddings.weight.copy_(hf_model.model.embed_tokens.weight)
+        for layer, hf_layer in zip(model.decoder.layers, hf_model.model.layers):
+            layer.self_attention.linear_qkv.layer_norm_weight.copy_(hf_layer.input_layernorm.weight)
+
+            q = hf_layer.self_attn.q_proj.weight.view([num_attention_heads, -1, head_dim, hidden_dim])
+            k = hf_layer.self_attn.k_proj.weight.view([num_attention_heads, -1, head_dim, hidden_dim])
+            v = hf_layer.self_attn.v_proj.weight.view([num_attention_heads, -1, head_dim, hidden_dim])
+            qkv = torch.cat([q, k, v], dim=1).view(-1, hidden_dim).contiguous()
+
+            q_bias = hf_layer.self_attn.q_proj.bias.view([num_attention_heads, -1])
+            k_bias = hf_layer.self_attn.k_proj.bias.view([num_attention_heads, -1])
+            v_bias = hf_layer.self_attn.v_proj.bias.view([num_attention_heads, -1])
+            qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=1).view(-1).contiguous()
+
+            layer.self_attention.linear_qkv.weight.copy_(qkv)
+            layer.self_attention.linear_qkv.bias.copy_(qkv_bias)
+
+            layer.self_attention.linear_proj.weight.copy_(hf_layer.self_attn.o_proj.weight)
+            layer.pre_mlp_layernorm.weight.copy_(hf_layer.post_attention_layernorm.weight)
+
+            layer.mlp.router.weight.copy_(hf_layer.mlp.gate.weight)
+
+            for idx, hf_expert in enumerate(hf_layer.mlp.experts):
+                fc1_weight = torch.cat([hf_expert.gate_proj.weight, hf_expert.up_proj.weight])
+                layer.mlp.experts.linear_fc1._parameters[f"weight{idx}"].copy_(fc1_weight)
+                layer.mlp.experts.linear_fc2._parameters[f"weight{idx}"].copy_(hf_expert.down_proj.weight)
+
+            layer.mlp.shared_experts.gate_weight.copy_(hf_layer.mlp.shared_expert_gate.weight)
+            shared_fc1_weight = torch.cat(
+                [hf_layer.mlp.shared_expert.gate_proj.weight, hf_layer.mlp.shared_expert.up_proj.weight]
+            )
+            layer.mlp.shared_experts.linear_fc1.weight.copy_(shared_fc1_weight)
+            layer.mlp.shared_experts.linear_fc2.weight.copy_(hf_layer.mlp.shared_expert.down_proj.weight)
+
+        model.decoder.final_layernorm.weight.copy_(hf_model.model.norm.weight)
+        model.output_layer.weight.copy_(hf_model.lm_head.weight)
+
+
 def convert_hf_to_mcore(hf_model_path, output_path, test=False):
     os.makedirs(output_path, exist_ok=True)
     if len(os.listdir(output_path)) > 0 and not test:
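The QKV packing above is the subtle part of the conversion: Megatron-Core's fused `linear_qkv` stores rows grouped per attention head as [q-rows, k-rows, v-rows] for head 0, then head 1, and so on, while HF keeps three separate projection matrices. A standalone toy illustration of that reshaping, with made-up sizes and not part of the commit:

```python
import torch

# Toy sizes only (not the real Qwen1.5-MoE dimensions): 2 heads, head_dim 3, hidden 6.
num_heads, head_dim, hidden = 2, 3, 6
q_w = torch.arange(num_heads * head_dim * hidden, dtype=torch.float32).view(num_heads * head_dim, hidden)
k_w = torch.full((num_heads * head_dim, hidden), -1.0)
v_w = torch.full((num_heads * head_dim, hidden), -2.0)

# Same reshape/concat pattern as convert_checkpoint_from_transformers_to_megatron:
q = q_w.view(num_heads, -1, head_dim, hidden)
k = k_w.view(num_heads, -1, head_dim, hidden)
v = v_w.view(num_heads, -1, head_dim, hidden)
qkv = torch.cat([q, k, v], dim=1).view(-1, hidden)

# Rows are now interleaved per head: head0 q, head0 k, head0 v, head1 q, ...
assert torch.equal(qkv[0 * head_dim:1 * head_dim], q_w[:head_dim])  # head0 q rows
assert torch.equal(qkv[1 * head_dim:2 * head_dim], k_w[:head_dim])  # head0 k rows
assert torch.equal(qkv[2 * head_dim:3 * head_dim], v_w[:head_dim])  # head0 v rows
print(qkv.shape)  # torch.Size([18, 6]) == (3 * num_heads * head_dim, hidden)
```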
@@ -69,21 +114,22 @@ def convert_hf_to_mcore(hf_model_path, output_path, test=False):
         context_parallel_size=1,
         expert_model_parallel_size=1,
     )
+    model_parallel_cuda_manual_seed(0)
 
     # init hf config
     hf_config = AutoConfig.from_pretrained(hf_model_path)
     print(hf_config)
-    megatron_config = MegatronConfig()
+
     cfg = Config()
     cfg.model.path = hf_model_path
-    tfconfig = convert_config(hf_config, megatron_config)
+    tfconfig = hf_to_mcore_config(hf_config, torch.bfloat16)
     tie_word_embeddings = getattr(hf_config, "tie_word_embeddings", False)
 
     # init megatron model
     def megatron_model_provider(pre_process, post_process):
-        from verl.utils.model import get_parallel_gptmodel_from_config
+        from verl.models.mcore import init_mcore_model
 
-        parallel_model = get_parallel_gptmodel_from_config(
+        parallel_model = init_mcore_model(
             tfconfig,
             hf_config,
             pre_process,
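The `model_parallel_cuda_manual_seed(0)` call added above matters because Megatron-Core keeps a tensor-parallel RNG tracker that must be seeded before any mcore weights are materialized. A minimal single-rank sketch of the setup sequence the converter relies on (assumes one CUDA device and the usual MASTER_ADDR/MASTER_PORT environment variables for torch.distributed):

```python
import torch
import torch.distributed as dist
from megatron.core import parallel_state as mpu
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed

# Single-process "cluster": rank 0 of world size 1.
if not dist.is_initialized():
    dist.init_process_group(backend="nccl", world_size=1, rank=0)
torch.cuda.set_device(0)

mpu.initialize_model_parallel(
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
    virtual_pipeline_model_parallel_size=None,
    context_parallel_size=1,
    expert_model_parallel_size=1,
)
model_parallel_cuda_manual_seed(0)  # seed the TP RNG tracker before building the mcore model
```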
@@ -94,27 +140,31 @@ def megatron_model_provider(pre_process, post_process):
         return parallel_model
 
     model = get_model(
-        model_provider_func=megatron_model_provider, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True
+        model_provider_func=megatron_model_provider, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=False
     )
 
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
 
         # init hf model
-        hf_model = AutoModelForCausalLM.from_pretrained(hf_model_path)
+        hf_model = AutoModelForCausalLM.from_pretrained(hf_model_path, torch_dtype=torch.bfloat16)
         ref_state_dict = hf_model.state_dict()
 
         # load hf state dict to megatron model
-        from verl.models.mcore.loader import load_state_dict_to_megatron_gptmodel
-
-        load_state_dict_to_megatron_gptmodel(
-            state_dict=ref_state_dict,
-            wrapped_models=model,
-            config=hf_config,
-            params_dtype=torch.bfloat16,
-            is_value_model=False,
-        )
-        ssd = model[0].module.module.sharded_state_dict()
+        if "Qwen2MoeForCausalLM" in hf_config.architectures:
+            convert_checkpoint_from_transformers_to_megatron(hf_model, model[0].module, hf_config)
+        else:
+            from verl.models.mcore.loader import load_state_dict_to_megatron_gptmodel
+
+            load_state_dict_to_megatron_gptmodel(
+                state_dict=ref_state_dict,
+                wrapped_models=model,
+                config=hf_config,
+                params_dtype=torch.bfloat16,
+                is_value_model=False,
+            )
+
+        ssd = model[0].module.sharded_state_dict()
         del ref_state_dict, hf_model
 
         # save megatron model
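Related to the expert and shared-expert weights copied by the converter: `linear_fc1` stacks the HF `gate_proj` on top of `up_proj` because Megatron-Core's gated linear unit chunks its fc1 output in half and feeds the first half through the activation. A standalone toy check of that equivalence (illustrative sizes, not part of the commit; it assumes mcore's gate-first chunking order):

```python
import torch
import torch.nn.functional as F

hidden, ffn = 4, 6                  # toy sizes
x = torch.randn(2, hidden)
w_gate = torch.randn(ffn, hidden)   # plays the role of HF gate_proj.weight
w_up = torch.randn(ffn, hidden)     # plays the role of HF up_proj.weight

# HF Qwen2MoE expert MLP (before down_proj): silu(gate_proj(x)) * up_proj(x)
ref = F.silu(x @ w_gate.T) * (x @ w_up.T)

# mcore-style fused fc1: weights stacked [gate; up], output chunked into two halves
w_fc1 = torch.cat([w_gate, w_up])   # same stacking as fc1_weight in the converter
gate_out, up_out = (x @ w_fc1.T).chunk(2, dim=-1)
fused = F.silu(gate_out) * up_out

assert torch.allclose(ref, fused)
print("fused fc1 reproduces the separate gate/up projections")
```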
@@ -126,11 +176,11 @@ def megatron_model_provider(pre_process, post_process):
         model_test = get_model(
             model_provider_func=megatron_model_provider, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True
         )
-        ssd2 = model_test[0].module.module.sharded_state_dict()
+        ssd2 = model_test[0].module.sharded_state_dict()
         dist_checkpointing.load(ssd2, output_path, strict=StrictHandling.ASSUME_OK_UNEXPECTED)
 
-        sd = model[0].module.module.state_dict()
-        sd2 = model_test[0].module.module.state_dict()
+        sd = model[0].module.state_dict()
+        sd2 = model_test[0].module.state_dict()
         for k in sd.keys():
             if sd[k] is None:
                 continue
@@ -163,11 +213,11 @@ def megatron_value_model_provider(pre_process, post_process):
             model_type=ModelType.encoder_or_decoder,
             wrap_with_ddp=True,
         )
-        ssd2 = model_value[0].module.module.sharded_state_dict()
+        ssd2 = model_value[0].module.sharded_state_dict()
         dist_checkpointing.load(ssd2, output_path, strict=StrictHandling.IGNORE_ALL)
 
-        sd = model[0].module.module.state_dict()
-        sd2 = model_value[0].module.module.state_dict()
+        sd = model[0].module.state_dict()
+        sd2 = model_value[0].module.state_dict()
         for k in sd.keys():
             if sd[k] is None:
                 continue

verl/models/mcore/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -13,6 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .registry import get_mcore_forward_fn, hf_to_mcore_config, init_mcore_model
+from .registry import get_mcore_forward_fn, get_mcore_weight_converter, hf_to_mcore_config, init_mcore_model
 
-__all__ = ["init_mcore_model", "hf_to_mcore_config", "get_mcore_forward_fn"]
+__all__ = ["init_mcore_model", "hf_to_mcore_config", "get_mcore_forward_fn", "get_mcore_weight_converter"]

verl/models/mcore/config_converter.py

Lines changed: 64 additions & 2 deletions
@@ -66,8 +66,70 @@ def hf_to_mcore_config_dense(hf_config: PretrainedConfig, dtype: torch.dtype) ->
 
 
 def hf_to_mcore_config_qwen2moe(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig:
-    # Qwen2MoeForCausalLM
-    raise NotImplementedError("Qwen2MoeForCausalLM is not supported yet")
+    from megatron.core import parallel_state as mpu
+
+    overlap_p2p_comm = (
+        mpu.get_virtual_pipeline_model_parallel_world_size() is not None
+        and mpu.get_virtual_pipeline_model_parallel_world_size() > 1
+    )
+    batch_p2p_comm = False
+    transformer_config = TransformerConfig(
+        num_layers=hf_config.num_hidden_layers,
+        hidden_size=hf_config.hidden_size,
+        num_attention_heads=hf_config.num_attention_heads,
+        num_query_groups=hf_config.num_key_value_heads,
+        attention_dropout=hf_config.attention_dropout,
+        hidden_dropout=getattr(hf_config, "hidden_dropout", 0.0),
+        activation_func=F.silu,
+        normalization="RMSNorm",
+        gated_linear_unit=True,
+        use_cpu_initialization=False,
+        add_bias_linear=False,
+        pipeline_dtype=dtype,
+        params_dtype=dtype,
+        variable_seq_lengths=True,
+        masked_softmax_fusion=True,
+        attention_backend=AttnBackend.flash,
+        # attention_backend=AttnBackend.fused,
+        bf16=dtype is torch.bfloat16,
+        layernorm_epsilon=hf_config.rms_norm_eps,
+        ffn_hidden_size=hf_config.intermediate_size,
+        # parallel config
+        tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(),
+        pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(),
+        virtual_pipeline_model_parallel_size=mpu.get_virtual_pipeline_model_parallel_world_size(),
+        context_parallel_size=mpu.get_context_parallel_world_size(),
+        overlap_p2p_comm=overlap_p2p_comm,
+        batch_p2p_comm=batch_p2p_comm,
+        sequence_parallel=mpu.get_tensor_model_parallel_world_size() > 1,
+        # moe specific
+        moe_ffn_hidden_size=hf_config.moe_intermediate_size,
+        moe_token_dispatcher_type="alltoall",
+        moe_router_bias_update_rate=0.001,
+        moe_router_topk=hf_config.num_experts_per_tok,
+        num_moe_experts=hf_config.num_experts,
+        moe_shared_expert_intermediate_size=hf_config.shared_expert_intermediate_size,
+        moe_aux_loss_coeff=hf_config.router_aux_loss_coef,
+        # moe_aux_loss_coeff=0.0,
+        moe_router_load_balancing_type="aux_loss",
+        moe_shared_expert_overlap=True,
+        # moe_permute_fusion=True, # need TE 2.1+
+        moe_grouped_gemm=True,
+        moe_router_score_function="softmax",
+        # # mcore 0.12 moe
+        # moe_router_dtype="fp64",
+        # disable_bf16_reduced_precision_matmul=True,
+        # other
+        # deallocate_pipeline_outputs=True,
+        # gradient_accumulation_fusion=True,
+        persist_layer_norm=True,
+        bias_activation_fusion=True,
+        bias_dropout_fusion=True,
+        # qwen specific
+        moe_router_pre_softmax=True,
+        add_qkv_bias=True,
+    )
+    return transformer_config
 
 
 def hf_to_mcore_config_dpskv3(hf_config: PretrainedConfig, dtype: torch.dtype) -> TransformerConfig:
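With the stub replaced, a Qwen2MoE HF config can be translated through the same registry entry point the converter script uses. A minimal usage sketch, assuming Megatron's parallel state has already been initialized on the current rank (for example via the single-rank setup in scripts/converter_hf_to_mcore.py):

```python
import torch
from transformers import AutoConfig

from verl.models.mcore import hf_to_mcore_config

# Requires mpu.initialize_model_parallel(...) to have been called beforehand.
hf_config = AutoConfig.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B-Chat")
assert "Qwen2MoeForCausalLM" in hf_config.architectures

tf_config = hf_to_mcore_config(hf_config, torch.bfloat16)
print(tf_config.num_moe_experts)                      # routed experts, from hf_config.num_experts
print(tf_config.moe_router_topk)                      # experts per token, from hf_config.num_experts_per_tok
print(tf_config.moe_shared_expert_intermediate_size)  # shared-expert FFN width
```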
