diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh
index b46337679f41..56d8cad73525 100755
--- a/scripts/distribute/ci_case_auto.sh
+++ b/scripts/distribute/ci_case_auto.sh
@@ -60,6 +60,7 @@ function llama_case_list_auto() {
     llama_dygraph_auto_bs8_fp32_DP2-MP2
     llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2
     llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2
+    llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2
 
     llama_static_auto_recompute_bs8_fp32_DP1-MP1-PP1
     llama_static_auto_recompute_bs16_fp32_DP2-MP1-PP1
@@ -1657,6 +1658,98 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
     echo "=========== $FUNCNAME run end ==========="
 }
 
+function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2() {
+    # Only A100 supports this case.
+    if [ $IS_A100 -eq 0 ]; then
+        return
+    fi
+    echo "=========== $FUNCNAME run begin ==========="
+    export PYTHONPATH=$root_path/:$PYTHONPATH
+    export FLAGS_call_stack_level=3
+    export NVIDIA_TF32_OVERRIDE=0
+
+    task_name="llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2"
+    case_out_dir="output/$task_name"
+    case_log_dir="output/$task_name""_log"
+    rm -rf $case_out_dir
+    rm -rf $case_log_dir
+
+    python -u -m paddle.distributed.launch \
+        --gpus "0,1,2,3,4,5,6,7" \
+        --log_dir "output/$task_name""_log" \
+        ./run_pretrain_auto.py \
+        --model_name_or_path "meta-llama/Llama-2-13b" \
+        --tokenizer_name_or_path "meta-llama/Llama-2-13b" \
+        --input_dir "./data" \
+        --output_dir "./output" \
+        --split 949,50,1 \
+        --weight_decay 0.01 \
+        --warmup_ratio 0.01 \
+        --max_grad_norm 1.0 \
+        --learning_rate 3e-05 \
+        --min_learning_rate 3e-06 \
+        --max_steps 30 \
+        --logging_steps 10 \
+        --eval_steps 1000 \
+        --save_steps 50000 \
+        --continue_training 0 \
+        --do_train true \
+        --do_eval false \
+        --do_predict false \
+        --disable_tqdm true \
+        --skip_profile_timer true \
+        --save_total_limit 2 \
+        --device gpu \
+        --dataloader_num_workers 1 \
+        --distributed_dataloader 0 \
+        --enable_auto_parallel 1 \
+        --per_device_train_batch_size 1 \
+        --gradient_accumulation_steps 4 \
+        --per_device_eval_batch_size 1 \
+        --recompute false \
+        --recompute_use_reentrant true \
+        --recompute_granularity full \
+        --pp_recompute_interval 0 \
+        --bf16 true \
+        --fp16_opt_level "O2" \
+        --amp_master_grad true \
+        --fuse_attention_ffn false \
+        --fuse_attention_qkv true \
+        --fused_linear_param_grad_add 1 \
+        --fuse_sequence_parallel_allreduce false \
+        --use_flash_attention true \
+        --use_fused_rope true \
+        --use_fused_rms_norm true \
+        --max_seq_length 4096 \
+        --sep_parallel_degree 1 \
+        --sequence_parallel false \
+        --pipeline_parallel_degree 2 \
+        --sharding_parallel_degree 2 \
+        --tensor_parallel_degree 1 \
+        --virtual_pp_degree 3 \
+        --pipeline_schedule_mode "VPP" \
+        --sharding "stage2" \
+        --pipeline_parallel_config "enable_send_recv_overlap" \
+        --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
+        --sharding_parallel_config "enable_stage2_overlap" \
+        --tensor_parallel_config "enable_mp_async_allreduce" \
+        --to_static 1 \
+        --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
+        --amp_custom_white_list "lookup_table" "lookup_table_v2" \
+        --num_hidden_layers 12 \
+        --skip_memory_metrics 0 \
+        >>${log_path}/$FUNCNAME 2>&1
+    loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+    ips=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'interval_samples_per_second: ' '{print $2}' | awk -F ',' '{print $1}'`
+    mem=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'current_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
+    echo "result: loss=$loss ips=$ips mem=$mem"
+    loss_base=7.52383575
+    ips_base=12.4135
+    mem_base=29.140248775482178
+    check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
+    echo "=========== $FUNCNAME run end ==========="
+}
+
 function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
     echo "=========== $FUNCNAME run begin ==========="
     export PYTHONPATH=$root_path/:$PYTHONPATH
diff --git a/scripts/distribute/run_ci.sh b/scripts/distribute/run_ci.sh
index 4c8c011c455e..f558cde651b2 100644
--- a/scripts/distribute/run_ci.sh
+++ b/scripts/distribute/run_ci.sh
@@ -58,6 +58,14 @@ install_paddlenlp(){
 #    cd -
 #    python -c "import paddlenlp; print('paddlenlp commit:',paddlenlp.version.commit)";
 }
+
+install_external_ops(){
+    echo -e "\033[31m ---- Install extern_ops \033[0m"
+    export PYTHONPATH=${nlp_dir}:$PYTHONPATH
+    cd ${nlp_dir}/model_zoo/gpt-3/external_ops
+    python setup.py install
+    python -c "import fused_ln;";
+}
 ####################################
 get_diff_TO_case(){
     cd ${nlp_dir}
@@ -127,6 +135,8 @@ if [[ ${#case_list[*]} -ne 0 ]];then
     install_paddle
     # Install paddlenlp
     install_paddlenlp
+    # Install external_ops
+    install_external_ops
     case_num=1
     export FLAGS_install_deps=0