From 28b9eb84c14f2d8f78122582f2019b8eb097ce3e Mon Sep 17 00:00:00 2001
From: Liujie0926 <44688141+Liujie0926@users.noreply.github.com>
Date: Mon, 26 Feb 2024 14:11:39 +0800
Subject: [PATCH] [AutoConfig]add N2C16 (#7915)

* add N2C16

* fix

* update N2C16_gbs to 16

* fix env

* fix

* fix

* update

* fix env set

* fix

* fix bug

* update
---
 .../auto_tuner/autoconfig/llama7b_lora.json   |  7 +-
 .../autoconfig/llama7b_lora_N2C16.json        | 85 ++++++++++++++++++
 .../autoconfig/llama7b_pretrain_N2C16.json    | 87 +++++++++++++++++++
 .../auto_tuner/autoconfig/llama7b_sft.json    |  7 +-
 .../autoconfig/llama7b_sft_N2C16.json         | 83 ++++++++++++++++++
 .../CE_autotuner_llama7b_bs16_bf16_lora.sh    | 26 ++++++
 .../CE_autotuner_llama7b_bs16_bf16_sft.sh     | 26 ++++++
 .../benchmark_common/prepare.sh               | 28 ++++--
 .../benchmark_common/run_benchmark.sh         | 29 ++++---
 ...CE_autotuner_llama7b_bs16_bf16_pretrain.sh | 26 ++++++
 .../benchmark_common/prepare.sh               | 28 ++++--
 .../benchmark_common/run_benchmark.sh         | 29 ++++---
 .../llama/benchmark_common/run_benchmark.sh   |  1 +
 13 files changed, 428 insertions(+), 34 deletions(-)
 create mode 100644 tests/test_tipc/auto_tuner/autoconfig/llama7b_lora_N2C16.json
 create mode 100644 tests/test_tipc/auto_tuner/autoconfig/llama7b_pretrain_N2C16.json
 create mode 100644 tests/test_tipc/auto_tuner/autoconfig/llama7b_sft_N2C16.json
 create mode 100644 tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_lora.sh
 create mode 100644 tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_sft.sh
 create mode 100644 tests/test_tipc/auto_tuner/llama_pretrain/N2C16/CE_autotuner_llama7b_bs16_bf16_pretrain.sh

diff --git a/tests/test_tipc/auto_tuner/autoconfig/llama7b_lora.json b/tests/test_tipc/auto_tuner/autoconfig/llama7b_lora.json
index c66070f0ea5a..962fa3c3fca5 100644
--- a/tests/test_tipc/auto_tuner/autoconfig/llama7b_lora.json
+++ b/tests/test_tipc/auto_tuner/autoconfig/llama7b_lora.json
@@ -68,6 +68,10 @@
         "use_recompute": [
             "./autoconfig/llama7b_lora_params.json",
             "recompute"
+        ],
+        "recompute_granularity": [
+            "./autoconfig/llama7b_lora_params.json",
+            "recompute_granularity"
         ]
     },
     "schedule_prior": [
@@ -76,5 +80,6 @@
     "sharding_degree": "auto",
     "sharding_stage": "auto",
     "task_limit": 2000,
-    "use_recompute": "auto"
+    "use_recompute": "auto",
+    "recompute_granularity":"auto"
 }
diff --git a/tests/test_tipc/auto_tuner/autoconfig/llama7b_lora_N2C16.json b/tests/test_tipc/auto_tuner/autoconfig/llama7b_lora_N2C16.json
new file mode 100644
index 000000000000..4c70c60da92d
--- /dev/null
+++ b/tests/test_tipc/auto_tuner/autoconfig/llama7b_lora_N2C16.json
@@ -0,0 +1,85 @@
+{
+    "dp_degree": "auto",
+    "invalid_strategy": [
+        "stage3_mp*"
+    ],
+    "max_search_time": 900,
+    "max_time_per_task": 300,
+    "metric_cfg": {
+        "OptimizationDirection": "Maximize",
+        "name": "interval_samples_per_second"
+    },
+    "micro_batch_size": "auto",
+    "mode": "LoRA",
+    "model_cfg": {
+        "global_batch_size": 16,
+        "hidden_size": 4096,
+        "num_attention_heads": 32,
+        "num_layers": 28,
+        "vocab_size": 65024
+    },
+    "mp_degree": [
+        1
+    ],
+    "need_baseline": true,
+    "pp_degree": [
+        1
+    ],
+    "run_cmd": {
+        "gradient_accumulation_steps": [
+            "./autoconfig/llama7b_lora_params.json",
+            "gradient_accumulation_steps"
+        ],
+        "micro_batch_size": [
+            "./autoconfig/llama7b_lora_params.json",
+            "per_device_train_batch_size"
+        ],
+        "mp_degree": [
+            "./autoconfig/llama7b_lora_params.json",
+            "tensor_parallel_degree"
+        ],
+        "pp_degree": [
+            "./autoconfig/llama7b_lora_params.json",
+            "pipeline_parallel_degree"
+        ],
"run_best_stage": { + "autotuner_benchmark": [ + "./autoconfig/llama7b_lora_params.json", + "autotuner_benchmark", + 0 + ] + }, + "search_stage": { + "autotuner_benchmark": [ + "./autoconfig/llama7b_lora_params.json", + "autotuner_benchmark", + 1 + ] + }, + "sharding_degree": [ + "./autoconfig/llama7b_lora_params.json", + "sharding_parallel_degree" + ], + "sharding_stage": [ + "./autoconfig/llama7b_lora_params.json", + "sharding", + "stage" + ], + "use_recompute": [ + "./autoconfig/llama7b_lora_params.json", + "recompute" + ], + "recompute_granularity": [ + "./autoconfig/llama7b_lora_params.json", + "recompute_granularity" + ] + }, + "schedule_prior": [ + "mp4" + ], + "sharding_degree": "auto", + "sharding_stage": "auto", + "task_limit": 2000, + "use_recompute": "auto", + "recompute_granularity":"auto" +} diff --git a/tests/test_tipc/auto_tuner/autoconfig/llama7b_pretrain_N2C16.json b/tests/test_tipc/auto_tuner/autoconfig/llama7b_pretrain_N2C16.json new file mode 100644 index 000000000000..3399736118cf --- /dev/null +++ b/tests/test_tipc/auto_tuner/autoconfig/llama7b_pretrain_N2C16.json @@ -0,0 +1,87 @@ +{ + "dp_degree": "auto", + "max_search_time": 900, + "max_time_per_task": 400, + "metric_cfg": { + "OptimizationDirection": "Maximize", + "name": "interval_samples_per_second" + }, + "micro_batch_size": "auto", + "model_cfg": { + "global_batch_size": 16, + "hidden_size": 5120, + "num_attention_heads": 40, + "num_layers": 40, + "vocab_size": 32000 + }, + "mp_degree": "auto", + "pp_degree": "auto", + "run_cmd": { + "gradient_accumulation_steps": [ + "./autoconfig/llama7b_pretrain_params.json", + "gradient_accumulation_steps" + ], + "micro_batch_size": [ + "./autoconfig/llama7b_pretrain_params.json", + "per_device_train_batch_size" + ], + "mp_degree": [ + "./autoconfig/llama7b_pretrain_params.json", + "tensor_parallel_degree" + ], + "pp_degree": [ + "./autoconfig/llama7b_pretrain_params.json", + "pipeline_parallel_degree" + ], + "run_best_stage": { + "continue_training": [ + "./autoconfig/llama7b_pretrain_params.json", + "continue_training", + 0 + ], + "autotuner_benchmark": [ + "./autoconfig/llama7b_pretrain_params.json", + "autotuner_benchmark", + 0 + ] + }, + "search_stage": { + "continue_training": [ + "./autoconfig/llama7b_pretrain_params.json", + "continue_training", + 0 + ], + "autotuner_benchmark": [ + "./autoconfig/llama7b_pretrain_params.json", + "autotuner_benchmark", + 1 + ] + }, + "sharding_degree": [ + "./autoconfig/llama7b_pretrain_params.json", + "sharding_parallel_degree" + ], + "sharding_stage": [ + "./autoconfig/llama7b_pretrain_params.json", + "sharding", + "stage" + ], + "use_recompute": [ + "./autoconfig/llama7b_pretrain_params.json", + "recompute" + ], + "recompute_granularity": [ + "./autoconfig/llama7b_pretrain_params.json", + "recompute_granularity" + ] + }, + "sharding_degree": "auto", + "sharding_stage": "auto", + "task_limit": 2000, + "use_recompute": "auto", + "recompute_granularity": "auto", + "invalid_strategy": ["stage3_mp*"], + "schedule_prior": ["mp4"], + "need_baseline": true, + "mode": "Pretrain" + } \ No newline at end of file diff --git a/tests/test_tipc/auto_tuner/autoconfig/llama7b_sft.json b/tests/test_tipc/auto_tuner/autoconfig/llama7b_sft.json index 234704a82071..b296b4edf7bd 100644 --- a/tests/test_tipc/auto_tuner/autoconfig/llama7b_sft.json +++ b/tests/test_tipc/auto_tuner/autoconfig/llama7b_sft.json @@ -66,6 +66,10 @@ "use_recompute": [ "./autoconfig/llama7b_sft_params.json", "recompute" + ], + "recompute_granularity": [ + 
"./autoconfig/llama7b_lora_params.json", + "recompute_granularity" ] }, "schedule_prior": [ @@ -74,5 +78,6 @@ "sharding_degree": "auto", "sharding_stage": "auto", "task_limit": 2000, - "use_recompute": "auto" + "use_recompute": "auto", + "recompute_granularity":"auto" } \ No newline at end of file diff --git a/tests/test_tipc/auto_tuner/autoconfig/llama7b_sft_N2C16.json b/tests/test_tipc/auto_tuner/autoconfig/llama7b_sft_N2C16.json new file mode 100644 index 000000000000..81c59dd8d86e --- /dev/null +++ b/tests/test_tipc/auto_tuner/autoconfig/llama7b_sft_N2C16.json @@ -0,0 +1,83 @@ +{ + "dp_degree": "auto", + "invalid_strategy": [ + "stage3_mp*" + ], + "max_search_time": 900, + "max_time_per_task": 300, + "metric_cfg": { + "OptimizationDirection": "Maximize", + "name": "interval_samples_per_second" + }, + "micro_batch_size": "auto", + "mode": "SFT", + "model_cfg": { + "global_batch_size": 16, + "hidden_size": 4096, + "num_attention_heads": 32, + "num_layers": 28, + "vocab_size": 65024 + }, + "mp_degree": "auto", + "need_baseline": true, + "pp_degree": [ + 1 + ], + "run_cmd": { + "gradient_accumulation_steps": [ + "./autoconfig/llama7b_sft_params.json", + "gradient_accumulation_steps" + ], + "micro_batch_size": [ + "./autoconfig/llama7b_sft_params.json", + "per_device_train_batch_size" + ], + "mp_degree": [ + "./autoconfig/llama7b_sft_params.json", + "tensor_parallel_degree" + ], + "pp_degree": [ + "./autoconfig/llama7b_sft_params.json", + "pipeline_parallel_degree" + ], + "run_best_stage": { + "autotuner_benchmark": [ + "./autoconfig/llama7b_sft_params.json", + "autotuner_benchmark", + 0 + ] + }, + "search_stage": { + "autotuner_benchmark": [ + "./autoconfig/llama7b_sft_params.json", + "autotuner_benchmark", + 1 + ] + }, + "sharding_degree": [ + "./autoconfig/llama7b_sft_params.json", + "sharding_parallel_degree" + ], + "sharding_stage": [ + "./autoconfig/llama7b_sft_params.json", + "sharding", + "stage" + ], + "use_recompute": [ + "./autoconfig/llama7b_sft_params.json", + "recompute" + ], + "recompute_granularity": [ + "./autoconfig/llama7b_lora_params.json", + "recompute_granularity" + ] + }, + "schedule_prior": [ + "mp4" + ], + "sharding_degree": "auto", + "sharding_stage": "auto", + "task_limit": 2000, + "use_recompute": "auto", + "recompute_granularity":"auto" +} \ No newline at end of file diff --git a/tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_lora.sh b/tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_lora.sh new file mode 100644 index 000000000000..294b9e74d6be --- /dev/null +++ b/tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_lora.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +param="model_item=CE_autotuner_llama7b " +param+="run_mode=lora " +param+="device_num=N2C16 " +param+="global_batch_size=16 " +param+="nnodes=2 " +param+="autoconfig_json_file=autoconfig/llama7b_lora_N2C16.json " +param+="modle_json_file=autoconfig/llama7b_lora_params.json " + +cd ./tests +bash ./test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh multi + +bash -c "${param} bash ./test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh" diff --git a/tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_sft.sh b/tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_sft.sh new file mode 100644 index 000000000000..e04792ab6e47 --- /dev/null +++ b/tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_sft.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +param="model_item=CE_autotuner_llama7b " +param+="run_mode=sft " +param+="device_num=N2C16 " +param+="global_batch_size=16 " +param+="nnodes=2 " +param+="autoconfig_json_file=autoconfig/llama7b_sft_N2C16.json " +param+="modle_json_file=autoconfig/llama7b_sft_params.json " + +cd ./tests +bash ./test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh multi + +bash -c "${param} bash ./test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh" diff --git a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh index 3bb53514be7f..2877c55661c7 100644 --- a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh +++ b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh @@ -23,9 +23,25 @@ tar -zxvf AdvertiseGen.tar.gz && rm -rf AdvertiseGen.tar.gz # mv autoconfig rm -rf autoconfig cp -r ../tests/test_tipc/auto_tuner/autoconfig ./ -unset PADDLE_ELASTIC_JOB_ID -unset PADDLE_TRAINER_ENDPOINTS -unset DISTRIBUTED_TRAINER_ENDPOINTS -unset FLAGS_START_PORT -unset PADDLE_ELASTIC_TIMEOUT -unset PADDLE_TRAINERS_NUM + +if [ -z "$1" ]; then + echo "单机任务" +else + echo "多机任务, 启动etcd服务" + pip install httpx etcd3 protobuf==3.20.0 --force-reinstall + ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' ')) + master_ip=${ip_lists[0]} + rank=$PADDLE_TRAINER_ID + echo $master_ip $rank + if [ $rank == 0 ]; then + net=$(netstat -anp | grep 2379 | grep "LISTEN") + if [ ${#net} == 0 ]; then + apt-get install -y --allow-downgrades etcd + nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 & + ps -ef |grep etcd + fi + else + sleep 5 + fi + sleep 5 +fi diff --git a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh index bed243d2c022..785adab372df 100644 --- a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh +++ 
+++ b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh
@@ -35,6 +35,9 @@ function _set_params(){
     fp_item="bf16"
     workerlog_id=0
 
+    ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' '))
+    master_ip=${ip_lists[0]}
+    nnodes=${nnodes:-1}
     # Common execution commands below; no changes needed unless special handling is required
     model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (required) do not change the format; keep it aligned with the competitor model name
     device=${CUDA_VISIBLE_DEVICES//,/ }
@@ -74,24 +77,23 @@ function _train(){
         log_file=${train_log_file}
     fi
 
-    if [ ${PADDLE_TRAINER_ID} ]
-    then
-        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
-    else
-        PADDLE_RANK_OPTION=""
-    fi
     # Common execution commands below; no changes needed unless special handling is required
     case ${device_num} in
     N1C1) echo "Run with: device_num=${device_num} run_mode=${run_mode}"
-        train_cmd="python -m paddle.distributed.launch --gpus=0 ${PADDLE_RANK_OPTION}\
+        train_cmd="python -m paddle.distributed.launch --gpus=0 \
            --auto_tuner_json ${autoconfig_json_file} finetune_generation.py ${modle_json_file}"
        ;;
-    N1C8|N2C16) echo "Run with: device_num=${device_num} run_mode=${run_mode}"
-        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
+    N1C8) echo "Run with: device_num=${device_num} run_mode=${run_mode}"
+        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
            --auto_tuner_json ${autoconfig_json_file} finetune_generation.py ${modle_json_file}"
        ;;
+    N2C16) echo "Run with: device_num=${device_num} run_mode=${run_mode}"
+        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
+            --auto_tuner_json ${autoconfig_json_file} --master etcd://$master_ip:2379 --nnodes $nnodes:$nnodes \
+            finetune_generation.py ${modle_json_file}"
+        ;;
     *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
-        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
+        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
            --auto_tuner_json ${autoconfig_json_file} finetune_generation.py ${modle_json_file}"
        ;;
     esac
@@ -123,6 +125,13 @@ function _train(){
 }
 
 export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
+unset PADDLE_ELASTIC_JOB_ID
+unset PADDLE_TRAINER_ENDPOINTS
+unset DISTRIBUTED_TRAINER_ENDPOINTS
+unset FLAGS_START_PORT
+unset PADDLE_ELASTIC_TIMEOUT
+unset PADDLE_TRAINERS_NUM
+unset PADDLE_TRAINER_ID
 source ${BENCHMARK_ROOT}/scripts/run_model.sh  # this script parses performance data from benchmark-compliant logs with analysis.py; comment this line out to produce only the training log without parsing, but re-enable it before submitting
 _set_params $@
 #_train  # uncomment to produce the training log only, without parsing
diff --git a/tests/test_tipc/auto_tuner/llama_pretrain/N2C16/CE_autotuner_llama7b_bs16_bf16_pretrain.sh b/tests/test_tipc/auto_tuner/llama_pretrain/N2C16/CE_autotuner_llama7b_bs16_bf16_pretrain.sh
new file mode 100644
index 000000000000..8f374d1c5e93
--- /dev/null
+++ b/tests/test_tipc/auto_tuner/llama_pretrain/N2C16/CE_autotuner_llama7b_bs16_bf16_pretrain.sh
@@ -0,0 +1,26 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +param="model_item=CE_autotuner_llama7b " +param+="run_mode=pretrain " +param+="device_num=N2C16 " +param+="global_batch_size=16 " +param+="nnodes=2 " +param+="autoconfig_json_file=autoconfig/llama7b_pretrain_N2C16.json " +param+="modle_json_file=autoconfig/llama7b_pretrain_params.json " + +cd ./tests +bash ./test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh multi + +bash -c "${param} bash ./test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh" diff --git a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh index 7fe08f4d2e34..24f852b4cb43 100644 --- a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh +++ b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh @@ -32,9 +32,25 @@ mv llama_openwebtext_100k_idx.npz ./data # mv autoconfig rm -rf autoconfig cp -r ../../tests/test_tipc/auto_tuner/autoconfig ./ -unset PADDLE_ELASTIC_JOB_ID -unset PADDLE_TRAINER_ENDPOINTS -unset DISTRIBUTED_TRAINER_ENDPOINTS -unset FLAGS_START_PORT -unset PADDLE_ELASTIC_TIMEOUT -unset PADDLE_TRAINERS_NUM + +if [ -z "$1" ]; then + echo "单机任务" +else + echo "多机任务, 启动etcd服务" + pip install httpx etcd3 protobuf==3.20.0 --force-reinstall + ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' ')) + master_ip=${ip_lists[0]} + rank=$PADDLE_TRAINER_ID + echo $master_ip $rank + if [ $rank == 0 ]; then + net=$(netstat -anp | grep 2379 | grep "LISTEN") + if [ ${#net} == 0 ]; then + apt-get install -y --allow-downgrades etcd + nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 & + ps -ef |grep etcd + fi + else + sleep 5 + fi + sleep 5 +fi diff --git a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh index 0a82e9bd5464..8055fc75932d 100644 --- a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh @@ -35,6 +35,9 @@ function _set_params(){ fp_item="bf16" workerlog_id=0 + ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' ')) + master_ip=${ip_lists[0]} + nnodes=${nnodes:-1} # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } @@ -74,24 +77,23 @@ function _train(){ log_file=${train_log_file} fi - if [ ${PADDLE_TRAINER_ID} ] - then - PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" - else - PADDLE_RANK_OPTION="" - fi # 以下为通用执行命令,无特殊可不用修改 case ${device_num} in N1C1) echo "Run with: device_num=${device_num} run_mode=${run_mode}" - train_cmd="python -m paddle.distributed.launch --gpus=0 ${PADDLE_RANK_OPTION}\ + train_cmd="python -m paddle.distributed.launch --gpus=0 \ --auto_tuner_json ${autoconfig_json_file} run_pretrain.py ${modle_json_file}" ;; - N1C8|N2C16) echo "Run with: device_num=${device_num}, run_mode=${run_mode}" - train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ + N1C8) echo "Run with: device_num=${device_num}, run_mode=${run_mode}" + train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ --auto_tuner_json ${autoconfig_json_file} run_pretrain.py ${modle_json_file}" ;; + N2C16) echo "Run with: device_num=${device_num} run_mode=${run_mode}" + train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ + --auto_tuner_json 
+            --auto_tuner_json ${autoconfig_json_file} --master etcd://$master_ip:2379 --nnodes $nnodes:$nnodes \
+            run_pretrain.py ${modle_json_file}"
+        ;;
     *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
-        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
+        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
            --auto_tuner_json ${autoconfig_json_file} run_pretrain.py ${modle_json_file}"
        ;;
     esac
@@ -140,6 +142,13 @@ function _train(){
 }
 
 export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
+unset PADDLE_ELASTIC_JOB_ID
+unset PADDLE_TRAINER_ENDPOINTS
+unset DISTRIBUTED_TRAINER_ENDPOINTS
+unset FLAGS_START_PORT
+unset PADDLE_ELASTIC_TIMEOUT
+unset PADDLE_TRAINERS_NUM
+unset PADDLE_TRAINER_ID
 source ${BENCHMARK_ROOT}/scripts/run_model.sh  # this script parses performance data from benchmark-compliant logs with analysis.py; comment this line out to produce only the training log without parsing, but re-enable it before submitting
 _set_params $@
 #_train  # uncomment to produce the training log only, without parsing
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh
index 78a5e832c553..6e4862bf3c43 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh
@@ -143,6 +143,7 @@ function _train(){
                     --tensor_parallel_config ${tensor_parallel_config} ${pipeline_parallel_config_args} \
                     --recompute ${recompute} \
                     --recompute_use_reentrant ${recompute_use_reentrant} \
+                    --skip_memory_metrics 0 \
                     --data_cache ./data_cache"
 
     if [ ${PADDLE_TRAINER_ID} ]