Update qwen && baichuan benchmark config (#8920)
deepllz authored Aug 13, 2024
1 parent 30fc639 commit f069a53
Showing 6 changed files with 43 additions and 32 deletions.
23 changes: 17 additions & 6 deletions paddlenlp/transformers/qwen/modeling.py
@@ -13,6 +13,7 @@
# limitations under the License.

import math
+import os
import warnings
from functools import partial
from typing import List
@@ -83,6 +84,11 @@ def swiglu(x, y=None):
fused_rotary_position_embedding = None


+def get_use_casual_mask():
+    """Get the value of the 'USE_CASUAL_MASK' environment variable."""
+    return os.getenv("USE_CASUAL_MASK", "False") == "True"


def parallel_matmul(x: Tensor, y: Tensor, tensor_parallel_output=True):
    is_fleet_init = True
    tensor_parallel_degree = 1
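For reference, the new helper only reads an environment variable, so the causal-mask shortcut is strictly opt-in. A small sketch of how the gate behaves (the helper is copied from the diff above; the surrounding assertions are illustrative):

    import os

    def get_use_casual_mask():
        # Mirror of the helper added above; only the exact string "True" enables the shortcut.
        return os.getenv("USE_CASUAL_MASK", "False") == "True"

    os.environ.pop("USE_CASUAL_MASK", None)
    assert get_use_casual_mask() is False    # unset -> falls back to the default "False"

    os.environ["USE_CASUAL_MASK"] = "True"
    assert get_use_casual_mask() is True     # enabled

    os.environ["USE_CASUAL_MASK"] = "1"
    assert get_use_casual_mask() is False    # any value other than "True" is treated as disabled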
@@ -803,13 +809,18 @@ def forward(
inputs_embeds = ScatterOp.apply(inputs_embeds)

hidden_states = inputs_embeds

+use_casual_mask = get_use_casual_mask()
-# bool 4D mask
-attention_mask = self.get_masks(input_shape[0], input_shape[1], past_length, padding_mask=attention_mask)
-zero = paddle.zeros(attention_mask.shape, dtype=hidden_states.dtype)
-neg_inf = paddle.full_like(attention_mask, paddle.finfo(hidden_states.dtype).min, dtype=hidden_states.dtype)
-# dtype 4D mask
-attention_mask = paddle.where(attention_mask, zero, neg_inf)
+if use_casual_mask:
+    attention_mask = None
+else:
+    attention_mask = self.get_masks(input_shape[0], input_shape[1], past_length, padding_mask=attention_mask)
+    zero = paddle.zeros(attention_mask.shape, dtype=hidden_states.dtype)
+    neg_inf = paddle.full_like(
+        attention_mask, paddle.finfo(hidden_states.dtype).min, dtype=hidden_states.dtype
+    )
+    # dtype 4D mask
+    attention_mask = paddle.where(attention_mask, zero, neg_inf)

hidden_states = self.drop(hidden_states)
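In the rewritten block, the else branch keeps the original behaviour: a boolean 4D mask is converted into an additive float mask in which positions that may attend become 0 and masked positions become the dtype's most negative value. A minimal standalone sketch of that conversion (shapes, dtype, and the causal pattern are chosen purely for illustration; get_masks itself is not reproduced here):

    import paddle

    batch_size, seq_len = 2, 4          # hypothetical sizes
    dtype = paddle.float32

    # Lower-triangular causal pattern broadcast to the usual [batch, 1, q_len, kv_len] layout.
    bool_mask = paddle.tril(paddle.ones([seq_len, seq_len])).astype("bool")
    bool_mask = bool_mask.unsqueeze(0).unsqueeze(0).expand([batch_size, 1, seq_len, seq_len])

    # Same conversion as in the diff: True -> 0, False -> finfo(dtype).min.
    zero = paddle.zeros(bool_mask.shape, dtype=dtype)
    neg_inf = paddle.full_like(zero, paddle.finfo(dtype).min)
    additive_mask = paddle.where(bool_mask, zero, neg_inf)

    print(additive_mask[0, 0])  # zeros on and below the diagonal, large negatives above it

When USE_CASUAL_MASK is "True", attention_mask stays None and the attention implementation is left to apply causal masking on its own (flash attention, for example, supports this), which is presumably the point of the shortcut.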

@@ -13,7 +13,7 @@
# limitations under the License.

param="model_item=baichuan-inc-Baichun2-13b_pretrain "
-param+="run_mode=DP1_MP2_PP2_VPP1_Sharding8_Stage1 "
+param+="run_mode=DP1_MP4_PP1_VPP1_Sharding8_Stage1 "
param+="device_num=N4C32 "
param+="global_batch_size=32 "
param+="nnodes=4 "
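The renamed run_mode swaps the 2-way tensor-parallel x 2-way pipeline layout for 4-way tensor parallelism with no pipeline stage; either way the degrees must still multiply out to the 32 cards implied by device_num=N4C32. A quick illustrative check (the regex parsing below is just for this sketch, not part of the benchmark tooling):

    import re

    run_mode = "DP1_MP4_PP1_VPP1_Sharding8_Stage1"
    degrees = {m.group(1): int(m.group(2)) for m in re.finditer(r"(DP|MP|PP|VPP|Sharding)(\d+)", run_mode)}
    assert degrees == {"DP": 1, "MP": 4, "PP": 1, "VPP": 1, "Sharding": 8}

    # 1 * 4 * 1 * 8 = 32 cards; the old DP1_MP2_PP2 layout gave 1 * 2 * 2 * 8 = 32 as well.
    assert degrees["DP"] * degrees["MP"] * degrees["PP"] * degrees["Sharding"] == 32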
@@ -5,16 +5,16 @@
"output_dir": "./output/baichun2-13b_pretrain_ckpts",
"split": "949,50,1",
"max_seq_length": 4096,
-"gradient_accumulation_steps": 4,
-"tensor_parallel_degree": 2,
-"pipeline_parallel_degree": 2,
+"gradient_accumulation_steps": 2,
+"tensor_parallel_degree": 4,
+"pipeline_parallel_degree": 1,
"virtual_pp_degree": 1,
"sequence_parallel": 1,
"sharding_parallel_degree": 8,
"sharding": "stage1",
-"pipeline_parallel_config": "enable_delay_scale_loss enable_sharding_comm_overlap enable_release_grads ",
-"tensor_parallel_config": "enable_delay_scale_loss enable_mp_async_allreduce enable_sp_async_reduce_scatter enable_mp_skip_c_identity enable_mp_fused_linear_param_grad_add",
-"per_device_train_batch_size": 1,
+"pipeline_parallel_config": "enable_sharding_comm_overlap enable_release_grads ",
+"tensor_parallel_config": "enable_mp_async_allreduce enable_sp_async_reduce_scatter enable_mp_skip_c_identity enable_mp_fused_linear_param_grad_add",
+"per_device_train_batch_size": 2,
"use_flash_attention": true,
"use_fused_rms_norm": true,
"fuse_attention_qkv": true,
@@ -42,5 +42,5 @@
"pp_recompute_interval": 1,
"device": "gpu",
"amp_master_grad": true,
-"sharding_parallel_config": "split_param enable_stage1_overlap"
+"sharding_parallel_config": "split_param enable_stage1_overlap enable_stage1_allgather_overlap"
}
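Despite the switch to 4-way tensor parallelism with no pipeline, the totals in this baichuan config are unchanged: 2 micro-batch x 2 accumulation steps x 8 sharding ranks still gives a global batch size of 32, and 4 x 1 x 8 still covers the 32 cards of N4C32. A small arithmetic sketch (it assumes the stage-1 sharding group is the only data-parallel dimension here):

    # New values from this config.
    per_device_bs, grad_accum, sharding_degree = 2, 2, 8
    tp, pp = 4, 1
    assert per_device_bs * grad_accum * sharding_degree == 32   # global batch size
    assert tp * pp * sharding_degree == 32                      # total cards (N4C32)

    # The old values (micro-batch 1, accumulation 4, tp 2, pp 2) hit the same totals.
    assert 1 * 4 * 8 == 32
    assert 2 * 2 * 8 == 32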
@@ -12,8 +12,8 @@
"sequence_parallel": 1,
"sharding_parallel_degree": 16,
"sharding": "stage1",
-"pipeline_parallel_config": "enable_delay_scale_loss enable_sharding_comm_overlap enable_release_grads ",
-"tensor_parallel_config": "enable_delay_scale_loss enable_mp_async_allreduce enable_sp_async_reduce_scatter enable_mp_skip_c_identity enable_mp_fused_linear_param_grad_add",
+"pipeline_parallel_config": "enable_sharding_comm_overlap enable_release_grads ",
+"tensor_parallel_config": "enable_mp_async_allreduce enable_sp_async_reduce_scatter enable_mp_skip_c_identity enable_mp_fused_linear_param_grad_add",
"per_device_train_batch_size": 2,
"use_flash_attention": true,
"use_fused_rms_norm": true,
@@ -42,5 +42,5 @@
"pp_recompute_interval": 1,
"device": "gpu",
"amp_master_grad": true,
-"sharding_parallel_config": "split_param enable_stage1_overlap"
+"sharding_parallel_config": "split_param enable_stage1_overlap enable_stage1_allgather_overlap"
}
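Across both baichuan configs the option strings change the same way: enable_delay_scale_loss is dropped from the pipeline- and tensor-parallel configs, and enable_stage1_allgather_overlap is appended to the sharding config. A set-difference sketch over values copied from the diff (plain string handling, not PaddleNLP's actual argument parsing):

    old_tp = "enable_delay_scale_loss enable_mp_async_allreduce enable_sp_async_reduce_scatter enable_mp_skip_c_identity enable_mp_fused_linear_param_grad_add"
    new_tp = "enable_mp_async_allreduce enable_sp_async_reduce_scatter enable_mp_skip_c_identity enable_mp_fused_linear_param_grad_add"
    assert set(old_tp.split()) - set(new_tp.split()) == {"enable_delay_scale_loss"}
    assert set(new_tp.split()) - set(old_tp.split()) == set()   # nothing new is added here

    old_sharding = "split_param enable_stage1_overlap"
    new_sharding = "split_param enable_stage1_overlap enable_stage1_allgather_overlap"
    assert set(new_sharding.split()) - set(old_sharding.split()) == {"enable_stage1_allgather_overlap"}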
@@ -16,30 +16,30 @@
param="model_name_or_path=qwen/qwen-14b "
param+="per_device_train_batch_size=1 "
param+="data_parallel_degree=1 "
-param+="tensor_parallel_degree=1 "
-param+="pipeline_parallel_degree=4 "
+param+="tensor_parallel_degree=2 "
+param+="pipeline_parallel_degree=1 "
param+="virtual_pp_degree=1 "
param+="sequence_parallel=0 "
-param+="sharding_parallel_degree=8 "
+param+="sharding_parallel_degree=16 "
param+="sharding=stage1 "
param+="recompute=0 "
param+="recompute_granularity=full_attn "
-param+="run_mode=MP1-PP4-sharding8-mbs1-acc4 "
+param+="run_mode=MP2-PP1-sharding16-mbs1-acc2 "
param+="device_num=N4C32 "
param+="global_batch_size=32 "
param+="model_item=qwen-qwen-14b_seqlen4096_pretrain "
param+="max_steps=100 "
-param+="gradient_accumulation_steps=4 "
+param+="gradient_accumulation_steps=2 "
param+="pp_recompute_interval=1 "
-param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, "
+param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, "
# Newly added parameters for multi-node runs
-param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads, "
+param+="pipeline_parallel_config=enable_sharding_comm_overlap,enable_release_grads, "
param+="max_seq_length=4096 "
param+="min_learning_rate=0.000005 "
param+="save_steps=5000 "
param+="eval_steps=1000 "
param+="scale_loss=1024 "
-param+="sharding_parallel_config=split_param,enable_stage1_overlap, "
+param+="sharding_parallel_config=split_param,enable_stage1_overlap,enable_stage1_allgather_overlap "


cd ./tests
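The qwen-14b job therefore moves from MP1-PP4 with 8-way sharding and 4 accumulation steps to MP2-PP1 with 16-way sharding and 2 steps; both layouts keep 32 cards and a global batch size of 32. A brief consistency check (again assuming the sharding group doubles as the data-parallel dimension):

    # New layout: MP2-PP1-sharding16-mbs1-acc2.
    mp, pp, sharding, mbs, acc = 2, 1, 16, 1, 2
    assert mp * pp * sharding == 32     # device_num=N4C32
    assert mbs * acc * sharding == 32   # global_batch_size=32

    # Old layout: MP1-PP4-sharding8-mbs1-acc4.
    assert 1 * 4 * 8 == 32              # cards
    assert 1 * 4 * 8 == 32              # batch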
@@ -14,32 +14,32 @@


param="model_name_or_path=qwen/qwen-7b "
-param+="per_device_train_batch_size=2 "
+param+="per_device_train_batch_size=1 "
param+="data_parallel_degree=1 "
-param+="tensor_parallel_degree=2 "
+param+="tensor_parallel_degree=1 "
param+="pipeline_parallel_degree=1 "
param+="virtual_pp_degree=1 "
param+="sequence_parallel=1 "
-param+="sharding_parallel_degree=16 "
+param+="sharding_parallel_degree=32 "
param+="sharding=stage1 "
param+="recompute=0 "
param+="recompute_granularity=full_attn "
-param+="run_mode=MP2-PP1-sharding16-mbs2-acc1 "
+param+="run_mode=MP1-PP1-sharding32-mbs1-acc1 "
param+="device_num=N4C32 "
param+="global_batch_size=32 "
param+="model_item=qwen-qwen-7b_seqlen4096_pretrain "
param+="max_steps=100 "
param+="gradient_accumulation_steps=1 "
param+="pp_recompute_interval=1 "
-param+="tensor_parallel_config=enable_delay_scale_loss,enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add "
+param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add "
# Newly added parameters for multi-node runs
-param+="pipeline_parallel_config=enable_delay_scale_loss,enable_sharding_comm_overlap,enable_release_grads "
+param+="pipeline_parallel_config=enable_sharding_comm_overlap,enable_release_grads "
param+="max_seq_length=4096 "
param+="min_learning_rate=0.000005 "
param+="save_steps=5000 "
param+="eval_steps=1000 "
param+="scale_loss=1024 "
-param+="sharding_parallel_config=split_param,enable_stage1_overlap "
+param+="sharding_parallel_config=split_param,enable_stage1_overlap,enable_stage1_allgather_overlap "


cd ./tests
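The qwen-7b job goes further and drops tensor parallelism entirely, running pure 32-way stage-1 sharding with micro-batch 1 (1 x 1 x 32 = 32 cards, and 1 x 1 x 32 = 32 global batch). Since all of these settings travel as a single space-separated string of key=value tokens, the string can be sanity-checked before launching; the dict-based parsing below is only for this sketch, not the benchmark runner's own logic:

    param = "model_name_or_path=qwen/qwen-7b "
    param += "per_device_train_batch_size=1 "
    param += "tensor_parallel_degree=1 "
    param += "sharding_parallel_degree=32 "
    param += "run_mode=MP1-PP1-sharding32-mbs1-acc1 "

    settings = dict(token.split("=", 1) for token in param.split())
    assert settings["sharding_parallel_degree"] == "32"
    assert settings["run_mode"] == "MP1-PP1-sharding32-mbs1-acc1"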