【AutoParallel】Add llama2 UT for auto-parallel (#8300)
* add llama UT

* add llama UT

* update data

* install external ops

* update data

* update data
heavyrain-lzy authored Apr 24, 2024
1 parent 273c593 commit af3041c
Showing 2 changed files with 104 additions and 0 deletions.
94 changes: 94 additions & 0 deletions scripts/distribute/ci_case_auto.sh
@@ -60,6 +60,7 @@ function llama_case_list_auto() {
llama_dygraph_auto_bs8_fp32_DP2-MP2
llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2
llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2
llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2

llama_static_auto_recompute_bs8_fp32_DP1-MP1-PP1
llama_static_auto_recompute_bs16_fp32_DP2-MP1-PP1
@@ -1657,6 +1658,99 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
echo "=========== $FUNCNAME run end ==========="
}

function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2() {
# Only A100 supports this case.
if [ $IS_A100 -eq 0 ]; then
return
fi
echo "=========== $FUNCNAME run begin ==========="
export PYTHONPATH=$root_path/:$PYTHONPATH
export FLAGS_call_stack_level=3
export NVIDIA_TF32_OVERRIDE=0

task_name="llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2"
case_out_dir="output/$task_name"
case_log_dir="output/$task_name""_log"
rm -rf $case_out_dir
rm -rf $case_log_dir

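# Parallel layout over the 8 launched GPUs: PP4 x SD2 (sharding stage2) x MP1 x DP1 = 8 ranks.
# virtual_pp_degree 3 gives PP4 x VPP3 = 12 virtual stages, one per layer (num_hidden_layers 12).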
python -u -m paddle.distributed.launch \
--gpus "0,1,2,3,4,5,6,7" \
--log_dir "output/$task_name""_log" \
./run_pretrain_auto.py \
--model_name_or_path "meta-llama/Llama-2-13b" \
--tokenizer_name_or_path "meta-llama/Llama-2-13b" \
--input_dir "./data" \
--output_dir "./output" \
--split 949,50,1 \
--weight_decay 0.01 \
--warmup_ratio 0.01 \
--max_grad_norm 1.0 \
--learning_rate 3e-05 \
--min_learning_rate 3e-06 \
--max_steps 30 \
--logging_steps 10 \
--eval_steps 1000 \
--save_steps 50000 \
--continue_training 0 \
--do_train true \
--do_eval false \
--do_predict false \
--disable_tqdm true \
--skip_profile_timer true \
--save_total_limit 2 \
--device gpu \
--dataloader_num_workers 1 \
--distributed_dataloader 0 \
--enable_auto_parallel 1 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 4 \
--per_device_eval_batch_size 1 \
--recompute false \
--recompute_use_reentrant true \
--recompute_granularity full \
--pp_recompute_interval 0 \
--bf16 true \
--fp16_opt_level "O2" \
--amp_master_grad true \
--fuse_attention_ffn false \
--fuse_attention_qkv true \
--fused_linear_param_grad_add 1 \
--fuse_sequence_parallel_allreduce false \
--use_flash_attention true \
--use_fused_rope true \
--use_fused_rms_norm true \
--max_seq_length 4096 \
--sep_parallel_degree 1 \
--sequence_parallel false \
--pipeline_parallel_degree 4 \
--sharding_parallel_degree 2 \
--tensor_parallel_degree 1 \
--virtual_pp_degree 3 \
--pipeline_schedule_mode "VPP" \
--sharding "stage2" \
--pipeline_parallel_config "enable_send_recv_overlap" \
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
--sharding_parallel_config "enable_stage2_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--to_static 1 \
--amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
--amp_custom_white_list "lookup_table" "lookup_table_v2" \
--num_hidden_layers 12 \
--skip_memory_metrics 0 \
>>${log_path}/$FUNCNAME 2>&1
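# The three metrics are scraped from worker 0's step-30 summary line, which
# (illustratively) looks like:
#   global_step: 30, loss: 7.52, interval_samples_per_second: 12.41, current_memory_reserved: 29.14, ...
# The first awk keeps the text after each field label; the second trims it at the next comma.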
loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
ips=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'interval_samples_per_second: ' '{print $2}' | awk -F ',' '{print $1}'`
mem=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'current_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
echo "result: loss=$loss ips=$ips mem=$mem"
loss_base=7.52383575
ips_base=12.4135
mem_base=29.140248775482178
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}
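check_result is defined elsewhere in scripts/distribute/ci_case_auto.sh and is not part of this diff. The sketch below is a minimal, hypothetical version of such a helper, assuming a plain relative-tolerance comparison; the name check_result_sketch, the 1% tolerance, and the output format are illustrative assumptions, not the repository's actual implementation.

check_result_sketch() {
    # Argument order mirrors the call above: name loss_base loss ips_base ips mem_base mem
    local name=$1 loss_base=$2 loss=$3 ips_base=$4 ips=$5 mem_base=$6 mem=$7
    local tol=0.01  # assumed 1% relative tolerance
    for triple in "loss $loss_base $loss" "ips $ips_base $ips" "mem $mem_base $mem"; do
        set -- $triple  # $1=metric $2=baseline $3=observed
        ok=$(awk -v b="$2" -v o="$3" -v t="$tol" 'BEGIN {d=(o-b)/b; if (d<0) d=-d; print (d<=t) ? 1 : 0}')
        if [ "$ok" -ne 1 ]; then
            echo "FAIL: $name $1 baseline=$2 observed=$3"
            return 1
        fi
    done
    echo "PASS: $name"
}

Whatever the real implementation does, the call site fixes the contract: the case name comes first, followed by baseline/observed pairs for loss, ips, and memory.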

function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
echo "=========== $FUNCNAME run begin ==========="
export PYTHONPATH=$root_path/:$PYTHONPATH
10 changes: 10 additions & 0 deletions scripts/distribute/run_ci.sh
@@ -58,6 +58,14 @@ install_paddlenlp(){
# cd -
# python -c "import paddlenlp; print('paddlenlp commit:',paddlenlp.version.commit)";
}

install_external_ops(){
echo -e "\033[31m ---- Install external_ops \033[0m"
export PYTHONPATH=${nlp_dir}:$PYTHONPATH
cd ${nlp_dir}/model_zoo/gpt-3/external_ops
python setup.py install
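# Smoke-test the build: fused_ln is one of the extensions compiled from external_ops.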
python -c "import fused_ln;";
}
####################################
get_diff_TO_case(){
cd ${nlp_dir}
@@ -127,6 +135,8 @@ if [[ ${#case_list[*]} -ne 0 ]];then
install_paddle
# Install paddlenlp
install_paddlenlp
# Install external_ops
install_external_ops

case_num=1
export FLAGS_install_deps=0
