From f069a5350fa055fd38be707146246b4a82b82fc8 Mon Sep 17 00:00:00 2001
From: zhengzhonghui
Date: Tue, 13 Aug 2024 12:58:36 +0800
Subject: [PATCH] Update qwen && baichuan benchmark config (#8920)

---
 paddlenlp/transformers/qwen/modeling.py       | 23 ++++++++++++++-----
 ...bf16_DP1_MP4_PP1_VPP1_Sharding8_Stage1.sh} |  2 +-
 .../pretrain-baichuan2_13b-config.json        | 14 +++++------
 .../pretrain-baichuan2_7b-config.json         |  6 ++---
 ...bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh} | 16 ++++++-------
 ...bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh} | 14 +++++------
 6 files changed, 43 insertions(+), 32 deletions(-)
 rename tests/test_tipc/dygraph/hybrid_parallelism/baichuan2/N4C32/{baichuan-inc-Baichuan2-13b_pretrain_bs32_bf16_DP1_MP2_PP2_VPP1_Sharding8_Stage1.sh => baichuan-inc-Baichuan2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding8_Stage1.sh} (94%)
 rename tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/{qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP1-PP4-sharding8-mbs1-acc4.sh => qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh} (76%)
 rename tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/{qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs2-acc1.sh => qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh} (78%)

diff --git a/paddlenlp/transformers/qwen/modeling.py b/paddlenlp/transformers/qwen/modeling.py
index 54e3b4849380..88602e5d8a82 100755
--- a/paddlenlp/transformers/qwen/modeling.py
+++ b/paddlenlp/transformers/qwen/modeling.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import math
+import os
 import warnings
 from functools import partial
 from typing import List
@@ -83,6 +84,11 @@ def swiglu(x, y=None):
     fused_rotary_position_embedding = None
 
 
+def get_use_casual_mask():
+    """Get the value of the 'USE_CASUAL_MASK' environment variable."""
+    return os.getenv("USE_CASUAL_MASK", "False") == "True"
+
+
 def parallel_matmul(x: Tensor, y: Tensor, tensor_parallel_output=True):
     is_fleet_init = True
     tensor_parallel_degree = 1
@@ -803,13 +809,18 @@ def forward(
             inputs_embeds = ScatterOp.apply(inputs_embeds)
 
         hidden_states = inputs_embeds
-
+        use_casual_mask = get_use_casual_mask()
         # bool 4D mask
-        attention_mask = self.get_masks(input_shape[0], input_shape[1], past_length, padding_mask=attention_mask)
-        zero = paddle.zeros(attention_mask.shape, dtype=hidden_states.dtype)
-        neg_inf = paddle.full_like(attention_mask, paddle.finfo(hidden_states.dtype).min, dtype=hidden_states.dtype)
-        # dtype 4D mask
-        attention_mask = paddle.where(attention_mask, zero, neg_inf)
+        if use_casual_mask:
+            attention_mask = None
+        else:
+            attention_mask = self.get_masks(input_shape[0], input_shape[1], past_length, padding_mask=attention_mask)
+            zero = paddle.zeros(attention_mask.shape, dtype=hidden_states.dtype)
+            neg_inf = paddle.full_like(
+                attention_mask, paddle.finfo(hidden_states.dtype).min, dtype=hidden_states.dtype
+            )
+            # dtype 4D mask
+            attention_mask = paddle.where(attention_mask, zero, neg_inf)
 
         hidden_states = self.drop(hidden_states)
 
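Note on the modeling change: `get_use_casual_mask()` returns a plain bool, so the branch tests it directly. When the `USE_CASUAL_MASK` toggle is on, `attention_mask` stays `None` and the attention kernel applies causality itself; otherwise the boolean 4D mask is converted into an additive mask in the activation dtype. Below is a minimal standalone sketch of that conversion, assuming only that Paddle is installed; the shapes and the float16 dtype are illustrative stand-ins, not values from the model.

import os

import paddle

# Same toggle the patch reads via get_use_casual_mask().
use_casual_mask = os.getenv("USE_CASUAL_MASK", "False") == "True"

if use_casual_mask:
    # Fast path: pass no explicit mask and let the attention kernel
    # (e.g. flash attention) enforce causality on its own.
    attention_mask = None
else:
    # bool 4D mask [batch, 1, tgt_len, src_len]: True = attend, False = masked.
    bool_mask = paddle.tril(paddle.ones([1, 1, 4, 4], dtype="int32")).astype("bool")
    dtype = paddle.float16  # stands in for hidden_states.dtype
    zero = paddle.zeros(bool_mask.shape, dtype=dtype)
    neg_inf = paddle.full_like(bool_mask, paddle.finfo(dtype).min, dtype=dtype)
    # Additive dtype 4D mask: 0 where attending, a large negative elsewhere.
    attention_mask = paddle.where(bool_mask, zero, neg_inf)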
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/baichuan2/N4C32/baichuan-inc-Baichuan2-13b_pretrain_bs32_bf16_DP1_MP2_PP2_VPP1_Sharding8_Stage1.sh b/tests/test_tipc/dygraph/hybrid_parallelism/baichuan2/N4C32/baichuan-inc-Baichuan2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding8_Stage1.sh
similarity index 94%
rename from tests/test_tipc/dygraph/hybrid_parallelism/baichuan2/N4C32/baichuan-inc-Baichuan2-13b_pretrain_bs32_bf16_DP1_MP2_PP2_VPP1_Sharding8_Stage1.sh
rename to tests/test_tipc/dygraph/hybrid_parallelism/baichuan2/N4C32/baichuan-inc-Baichuan2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding8_Stage1.sh
index 32f2a5c67acf..e4cb99e2eedc 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/baichuan2/N4C32/baichuan-inc-Baichuan2-13b_pretrain_bs32_bf16_DP1_MP2_PP2_VPP1_Sharding8_Stage1.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/baichuan2/N4C32/baichuan-inc-Baichuan2-13b_pretrain_bs32_bf16_DP1_MP4_PP1_VPP1_Sharding8_Stage1.sh
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 param="model_item=baichuan-inc-Baichuan2-13b_pretrain "
-param+="run_mode=DP1_MP2_PP2_VPP1_Sharding8_Stage1 "
+param+="run_mode=DP1_MP4_PP1_VPP1_Sharding8_Stage1 "
 param+="device_num=N4C32 "
 param+="global_batch_size=32 "
 param+="nnodes=4 "
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/baichuan2/auto_config_baichuan2_13b/pretrain-baichuan2_13b-config.json b/tests/test_tipc/dygraph/hybrid_parallelism/baichuan2/auto_config_baichuan2_13b/pretrain-baichuan2_13b-config.json
index 7ed6a313c22d..e0e4238a71ce 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/baichuan2/auto_config_baichuan2_13b/pretrain-baichuan2_13b-config.json
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/baichuan2/auto_config_baichuan2_13b/pretrain-baichuan2_13b-config.json
@@ -5,16 +5,16 @@
     "output_dir": "./output/baichuan2-13b_pretrain_ckpts",
     "split": "949,50,1",
     "max_seq_length": 4096,
-    "gradient_accumulation_steps": 4,
-    "tensor_parallel_degree": 2,
-    "pipeline_parallel_degree": 2,
+    "gradient_accumulation_steps": 2,
+    "tensor_parallel_degree": 4,
+    "pipeline_parallel_degree": 1,
     "virtual_pp_degree": 1,
     "sequence_parallel": 1,
     "sharding_parallel_degree": 8,
     "sharding": "stage1",
-    "pipeline_parallel_config": "enable_delay_scale_loss enable_sharding_comm_overlap enable_release_grads ",
-    "tensor_parallel_config": "enable_delay_scale_loss enable_mp_async_allreduce enable_sp_async_reduce_scatter enable_mp_skip_c_identity enable_mp_fused_linear_param_grad_add",
-    "per_device_train_batch_size": 1,
+    "pipeline_parallel_config": "enable_sharding_comm_overlap enable_release_grads ",
+    "tensor_parallel_config": "enable_mp_async_allreduce enable_sp_async_reduce_scatter enable_mp_skip_c_identity enable_mp_fused_linear_param_grad_add",
+    "per_device_train_batch_size": 2,
     "use_flash_attention": true,
     "use_fused_rms_norm": true,
     "fuse_attention_qkv": true,
@@ -42,5 +42,5 @@
     "pp_recompute_interval": 1,
     "device": "gpu",
     "amp_master_grad": true,
-    "sharding_parallel_config": "split_param enable_stage1_overlap"
+    "sharding_parallel_config": "split_param enable_stage1_overlap enable_stage1_allgather_overlap"
 }
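Both Baichuan2 configs keep the same N4C32 footprint: for the 13B job, DP1 x TP4 x PP1 x Sharding8 covers the 32 cards, and micro-batch 2 x accumulation 2 x the 8-way sharding (which splits the batch) reproduces global_batch_size=32. A quick self-contained check of that arithmetic; this is the usual hybrid-parallel accounting written out as a sketch, not code from this repo:

# Hypothetical sanity check (not in the repo): the new Baichuan2-13B layout
# must still fill N4C32 and keep the global batch size at 32.
data_parallel_degree = 1
tensor_parallel_degree = 4       # was 2
pipeline_parallel_degree = 1     # was 2
sharding_parallel_degree = 8
per_device_train_batch_size = 2  # was 1
gradient_accumulation_steps = 2  # was 4

num_gpus = (data_parallel_degree * tensor_parallel_degree
            * pipeline_parallel_degree * sharding_parallel_degree)
assert num_gpus == 32  # N4C32: 4 nodes x 8 GPUs each

# Stage-1 sharding splits the batch, so it multiplies into the global batch.
global_batch_size = (per_device_train_batch_size * gradient_accumulation_steps
                     * data_parallel_degree * sharding_parallel_degree)
assert global_batch_size == 32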
"enable_sharding_comm_overlap enable_release_grads ", + "tensor_parallel_config": "enable_mp_async_allreduce enable_sp_async_reduce_scatter enable_mp_skip_c_identity enable_mp_fused_linear_param_grad_add", "per_device_train_batch_size": 2, "use_flash_attention": true, "use_fused_rms_norm": true, @@ -42,5 +42,5 @@ "pp_recompute_interval": 1, "device": "gpu", "amp_master_grad": true, - "sharding_parallel_config": "split_param enable_stage1_overlap" + "sharding_parallel_config": "split_param enable_stage1_overlap enable_stage1_allgather_overlap" } diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP1-PP4-sharding8-mbs1-acc4.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh similarity index 76% rename from tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP1-PP4-sharding8-mbs1-acc4.sh rename to tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh index 001ccbf0a07f..bf6cd5a603b1 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP1-PP4-sharding8-mbs1-acc4.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh @@ -16,30 +16,30 @@ param="model_name_or_path=qwen/qwen-14b " param+="per_device_train_batch_size=1 " param+="data_parallel_degree=1 " -param+="tensor_parallel_degree=1 " -param+="pipeline_parallel_degree=4 " +param+="tensor_parallel_degree=2 " +param+="pipeline_parallel_degree=1 " param+="virtual_pp_degree=1 " param+="sequence_parallel=0 " -param+="sharding_parallel_degree=8 " +param+="sharding_parallel_degree=16 " param+="sharding=stage1 " param+="recompute=0 " param+="recompute_granularity=full_attn " -param+="run_mode=MP1-PP4-sharding8-mbs1-acc4 " +param+="run_mode=MP2-PP1-sharding16-mbs1-acc2 " param+="device_num=N4C32 " param+="global_batch_size=32 " param+="model_item=qwen-qwen-14b_seqlen4096_pretrain " param+="max_steps=100 " -param+="gradient_accumulation_steps=4 " +param+="gradient_accumulation_steps=2 " param+="pp_recompute_interval=1 " -param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, " +param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, " #多机新添加的参数 -param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads, " +param+="pipeline_parallel_config=enable_sharding_comm_overlap,enable_release_grads, " param+="max_seq_length=4096 " param+="min_learning_rate=0.000005 " param+="save_steps=5000 " param+="eval_steps=1000 " param+="scale_loss=1024 " -param+="sharding_parallel_config=split_param,enable_stage1_overlap, " +param+="sharding_parallel_config=split_param,enable_stage1_overlap,enable_stage1_allgather_overlap " cd ./tests diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs2-acc1.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh similarity index 78% rename from tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs2-acc1.sh rename to 
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs2-acc1.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh
similarity index 78%
rename from tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs2-acc1.sh
rename to tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh
index 69882b131a52..4a9ed7f11e82 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs2-acc1.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh
@@ -14,32 +14,32 @@
 
 param="model_name_or_path=qwen/qwen-7b "
-param+="per_device_train_batch_size=2 "
+param+="per_device_train_batch_size=1 "
 param+="data_parallel_degree=1 "
-param+="tensor_parallel_degree=2 "
+param+="tensor_parallel_degree=1 "
 param+="pipeline_parallel_degree=1 "
 param+="virtual_pp_degree=1 "
 param+="sequence_parallel=1 "
-param+="sharding_parallel_degree=16 "
+param+="sharding_parallel_degree=32 "
 param+="sharding=stage1 "
 param+="recompute=0 "
 param+="recompute_granularity=full_attn "
-param+="run_mode=MP2-PP1-sharding16-mbs2-acc1 "
+param+="run_mode=MP1-PP1-sharding32-mbs1-acc1 "
 param+="device_num=N4C32 "
 param+="global_batch_size=32 "
 param+="model_item=qwen-qwen-7b_seqlen4096_pretrain "
 param+="max_steps=100 "
 param+="gradient_accumulation_steps=1 "
 param+="pp_recompute_interval=1 "
-param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add "
+param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add "
 # Newly added parameters for multi-node runs
-param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads "
+param+="pipeline_parallel_config=enable_sharding_comm_overlap,enable_release_grads "
 param+="max_seq_length=4096 "
 param+="min_learning_rate=0.000005 "
 param+="save_steps=5000 "
 param+="eval_steps=1000 "
 param+="scale_loss=1024 "
-param+="sharding_parallel_config=split_param,enable_stage1_overlap "
+param+="sharding_parallel_config=split_param,enable_stage1_overlap,enable_stage1_allgather_overlap "
 
 cd ./tests
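The qwen-7b variant goes further and drops tensor parallelism entirely, running pure stage-1 sharding across all 32 cards (mbs1 x acc1 x sharding32 = 32). The run_mode tag in each script name is just these degrees spelled out; a tiny helper, hypothetical and for illustration only, shows how the names and settings line up:

# Hypothetical helper: rebuild a run_mode tag from the parallelism settings.
def run_mode(mp: int, pp: int, sharding: int, mbs: int, acc: int) -> str:
    return f"MP{mp}-PP{pp}-sharding{sharding}-mbs{mbs}-acc{acc}"

assert run_mode(1, 1, 32, 1, 1) == "MP1-PP1-sharding32-mbs1-acc1"  # qwen-7b
assert run_mode(2, 1, 16, 1, 2) == "MP2-PP1-sharding16-mbs1-acc2"  # qwen-14b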