From 28b9eb84c14f2d8f78122582f2019b8eb097ce3e Mon Sep 17 00:00:00 2001
From: Liujie0926 <44688141+Liujie0926@users.noreply.github.com>
Date: Mon, 26 Feb 2024 14:11:39 +0800
Subject: [PATCH] [AutoConfig]add N2C16 (#7915)

* add N2C16

* fix

* update N2C16_gbs to 16

* fix env

* fix

* fix

* update

* fix env set

* fix

* fix bug

* update
---
 .../auto_tuner/autoconfig/llama7b_lora.json   |  7 +-
 .../autoconfig/llama7b_lora_N2C16.json        | 85 ++++++++++++++++++
 .../autoconfig/llama7b_pretrain_N2C16.json    | 87 +++++++++++++++++++
 .../auto_tuner/autoconfig/llama7b_sft.json    |  7 +-
 .../autoconfig/llama7b_sft_N2C16.json         | 83 ++++++++++++++++++
 .../CE_autotuner_llama7b_bs16_bf16_lora.sh    | 26 ++++++
 .../CE_autotuner_llama7b_bs16_bf16_sft.sh     | 26 ++++++
 .../benchmark_common/prepare.sh               | 28 ++++--
 .../benchmark_common/run_benchmark.sh         | 29 ++++---
 ...CE_autotuner_llama7b_bs16_bf16_pretrain.sh | 26 ++++++
 .../benchmark_common/prepare.sh               | 28 ++++--
 .../benchmark_common/run_benchmark.sh         | 29 ++++---
 .../llama/benchmark_common/run_benchmark.sh   |  1 +
 13 files changed, 428 insertions(+), 34 deletions(-)
 create mode 100644 tests/test_tipc/auto_tuner/autoconfig/llama7b_lora_N2C16.json
 create mode 100644 tests/test_tipc/auto_tuner/autoconfig/llama7b_pretrain_N2C16.json
 create mode 100644 tests/test_tipc/auto_tuner/autoconfig/llama7b_sft_N2C16.json
 create mode 100644 tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_lora.sh
 create mode 100644 tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_sft.sh
 create mode 100644 tests/test_tipc/auto_tuner/llama_pretrain/N2C16/CE_autotuner_llama7b_bs16_bf16_pretrain.sh

diff --git a/tests/test_tipc/auto_tuner/autoconfig/llama7b_lora.json b/tests/test_tipc/auto_tuner/autoconfig/llama7b_lora.json
index c66070f0ea5a..962fa3c3fca5 100644
--- a/tests/test_tipc/auto_tuner/autoconfig/llama7b_lora.json
+++ b/tests/test_tipc/auto_tuner/autoconfig/llama7b_lora.json
@@ -68,6 +68,10 @@
         "use_recompute": [
             "./autoconfig/llama7b_lora_params.json",
             "recompute"
+        ],
+        "recompute_granularity": [
+            "./autoconfig/llama7b_lora_params.json",
+            "recompute_granularity"
         ]
     },
     "schedule_prior": [
@@ -76,5 +80,6 @@
     "sharding_degree": "auto",
     "sharding_stage": "auto",
     "task_limit": 2000,
-    "use_recompute": "auto"
+    "use_recompute": "auto",
+    "recompute_granularity":"auto"
 }
diff --git a/tests/test_tipc/auto_tuner/autoconfig/llama7b_lora_N2C16.json b/tests/test_tipc/auto_tuner/autoconfig/llama7b_lora_N2C16.json
new file mode 100644
index 000000000000..4c70c60da92d
--- /dev/null
+++ b/tests/test_tipc/auto_tuner/autoconfig/llama7b_lora_N2C16.json
@@ -0,0 +1,85 @@
+{
+    "dp_degree": "auto",
+    "invalid_strategy": [
+        "stage3_mp*"
+    ],
+    "max_search_time": 900,
+    "max_time_per_task": 300,
+    "metric_cfg": {
+        "OptimizationDirection": "Maximize",
+        "name": "interval_samples_per_second"
+    },
+    "micro_batch_size": "auto",
+    "mode": "LoRA",
+    "model_cfg": {
+        "global_batch_size": 16,
+        "hidden_size": 4096,
+        "num_attention_heads": 32,
+        "num_layers": 28,
+        "vocab_size": 65024
+    },
+    "mp_degree": [
+        1
+    ],
+    "need_baseline": true,
+    "pp_degree": [
+        1
+    ],
+    "run_cmd": {
+        "gradient_accumulation_steps": [
+            "./autoconfig/llama7b_lora_params.json",
+            "gradient_accumulation_steps"
+        ],
+        "micro_batch_size": [
+            "./autoconfig/llama7b_lora_params.json",
+            "per_device_train_batch_size"
+        ],
+        "mp_degree": [
+            "./autoconfig/llama7b_lora_params.json",
+            "tensor_parallel_degree"
+        ],
+        "pp_degree": [
+            "./autoconfig/llama7b_lora_params.json",
+            "pipeline_parallel_degree"
+        ],
"run_best_stage": { + "autotuner_benchmark": [ + "./autoconfig/llama7b_lora_params.json", + "autotuner_benchmark", + 0 + ] + }, + "search_stage": { + "autotuner_benchmark": [ + "./autoconfig/llama7b_lora_params.json", + "autotuner_benchmark", + 1 + ] + }, + "sharding_degree": [ + "./autoconfig/llama7b_lora_params.json", + "sharding_parallel_degree" + ], + "sharding_stage": [ + "./autoconfig/llama7b_lora_params.json", + "sharding", + "stage" + ], + "use_recompute": [ + "./autoconfig/llama7b_lora_params.json", + "recompute" + ], + "recompute_granularity": [ + "./autoconfig/llama7b_lora_params.json", + "recompute_granularity" + ] + }, + "schedule_prior": [ + "mp4" + ], + "sharding_degree": "auto", + "sharding_stage": "auto", + "task_limit": 2000, + "use_recompute": "auto", + "recompute_granularity":"auto" +} diff --git a/tests/test_tipc/auto_tuner/autoconfig/llama7b_pretrain_N2C16.json b/tests/test_tipc/auto_tuner/autoconfig/llama7b_pretrain_N2C16.json new file mode 100644 index 000000000000..3399736118cf --- /dev/null +++ b/tests/test_tipc/auto_tuner/autoconfig/llama7b_pretrain_N2C16.json @@ -0,0 +1,87 @@ +{ + "dp_degree": "auto", + "max_search_time": 900, + "max_time_per_task": 400, + "metric_cfg": { + "OptimizationDirection": "Maximize", + "name": "interval_samples_per_second" + }, + "micro_batch_size": "auto", + "model_cfg": { + "global_batch_size": 16, + "hidden_size": 5120, + "num_attention_heads": 40, + "num_layers": 40, + "vocab_size": 32000 + }, + "mp_degree": "auto", + "pp_degree": "auto", + "run_cmd": { + "gradient_accumulation_steps": [ + "./autoconfig/llama7b_pretrain_params.json", + "gradient_accumulation_steps" + ], + "micro_batch_size": [ + "./autoconfig/llama7b_pretrain_params.json", + "per_device_train_batch_size" + ], + "mp_degree": [ + "./autoconfig/llama7b_pretrain_params.json", + "tensor_parallel_degree" + ], + "pp_degree": [ + "./autoconfig/llama7b_pretrain_params.json", + "pipeline_parallel_degree" + ], + "run_best_stage": { + "continue_training": [ + "./autoconfig/llama7b_pretrain_params.json", + "continue_training", + 0 + ], + "autotuner_benchmark": [ + "./autoconfig/llama7b_pretrain_params.json", + "autotuner_benchmark", + 0 + ] + }, + "search_stage": { + "continue_training": [ + "./autoconfig/llama7b_pretrain_params.json", + "continue_training", + 0 + ], + "autotuner_benchmark": [ + "./autoconfig/llama7b_pretrain_params.json", + "autotuner_benchmark", + 1 + ] + }, + "sharding_degree": [ + "./autoconfig/llama7b_pretrain_params.json", + "sharding_parallel_degree" + ], + "sharding_stage": [ + "./autoconfig/llama7b_pretrain_params.json", + "sharding", + "stage" + ], + "use_recompute": [ + "./autoconfig/llama7b_pretrain_params.json", + "recompute" + ], + "recompute_granularity": [ + "./autoconfig/llama7b_pretrain_params.json", + "recompute_granularity" + ] + }, + "sharding_degree": "auto", + "sharding_stage": "auto", + "task_limit": 2000, + "use_recompute": "auto", + "recompute_granularity": "auto", + "invalid_strategy": ["stage3_mp*"], + "schedule_prior": ["mp4"], + "need_baseline": true, + "mode": "Pretrain" + } \ No newline at end of file diff --git a/tests/test_tipc/auto_tuner/autoconfig/llama7b_sft.json b/tests/test_tipc/auto_tuner/autoconfig/llama7b_sft.json index 234704a82071..b296b4edf7bd 100644 --- a/tests/test_tipc/auto_tuner/autoconfig/llama7b_sft.json +++ b/tests/test_tipc/auto_tuner/autoconfig/llama7b_sft.json @@ -66,6 +66,10 @@ "use_recompute": [ "./autoconfig/llama7b_sft_params.json", "recompute" + ], + "recompute_granularity": [ + 
"./autoconfig/llama7b_lora_params.json", + "recompute_granularity" ] }, "schedule_prior": [ @@ -74,5 +78,6 @@ "sharding_degree": "auto", "sharding_stage": "auto", "task_limit": 2000, - "use_recompute": "auto" + "use_recompute": "auto", + "recompute_granularity":"auto" } \ No newline at end of file diff --git a/tests/test_tipc/auto_tuner/autoconfig/llama7b_sft_N2C16.json b/tests/test_tipc/auto_tuner/autoconfig/llama7b_sft_N2C16.json new file mode 100644 index 000000000000..81c59dd8d86e --- /dev/null +++ b/tests/test_tipc/auto_tuner/autoconfig/llama7b_sft_N2C16.json @@ -0,0 +1,83 @@ +{ + "dp_degree": "auto", + "invalid_strategy": [ + "stage3_mp*" + ], + "max_search_time": 900, + "max_time_per_task": 300, + "metric_cfg": { + "OptimizationDirection": "Maximize", + "name": "interval_samples_per_second" + }, + "micro_batch_size": "auto", + "mode": "SFT", + "model_cfg": { + "global_batch_size": 16, + "hidden_size": 4096, + "num_attention_heads": 32, + "num_layers": 28, + "vocab_size": 65024 + }, + "mp_degree": "auto", + "need_baseline": true, + "pp_degree": [ + 1 + ], + "run_cmd": { + "gradient_accumulation_steps": [ + "./autoconfig/llama7b_sft_params.json", + "gradient_accumulation_steps" + ], + "micro_batch_size": [ + "./autoconfig/llama7b_sft_params.json", + "per_device_train_batch_size" + ], + "mp_degree": [ + "./autoconfig/llama7b_sft_params.json", + "tensor_parallel_degree" + ], + "pp_degree": [ + "./autoconfig/llama7b_sft_params.json", + "pipeline_parallel_degree" + ], + "run_best_stage": { + "autotuner_benchmark": [ + "./autoconfig/llama7b_sft_params.json", + "autotuner_benchmark", + 0 + ] + }, + "search_stage": { + "autotuner_benchmark": [ + "./autoconfig/llama7b_sft_params.json", + "autotuner_benchmark", + 1 + ] + }, + "sharding_degree": [ + "./autoconfig/llama7b_sft_params.json", + "sharding_parallel_degree" + ], + "sharding_stage": [ + "./autoconfig/llama7b_sft_params.json", + "sharding", + "stage" + ], + "use_recompute": [ + "./autoconfig/llama7b_sft_params.json", + "recompute" + ], + "recompute_granularity": [ + "./autoconfig/llama7b_lora_params.json", + "recompute_granularity" + ] + }, + "schedule_prior": [ + "mp4" + ], + "sharding_degree": "auto", + "sharding_stage": "auto", + "task_limit": 2000, + "use_recompute": "auto", + "recompute_granularity":"auto" +} \ No newline at end of file diff --git a/tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_lora.sh b/tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_lora.sh new file mode 100644 index 000000000000..294b9e74d6be --- /dev/null +++ b/tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_lora.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +param="model_item=CE_autotuner_llama7b " +param+="run_mode=lora " +param+="device_num=N2C16 " +param+="global_batch_size=16 " +param+="nnodes=2 " +param+="autoconfig_json_file=autoconfig/llama7b_lora_N2C16.json " +param+="modle_json_file=autoconfig/llama7b_lora_params.json " + +cd ./tests +bash ./test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh multi + +bash -c "${param} bash ./test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh" diff --git a/tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_sft.sh b/tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_sft.sh new file mode 100644 index 000000000000..e04792ab6e47 --- /dev/null +++ b/tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_sft.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +param="model_item=CE_autotuner_llama7b " +param+="run_mode=sft " +param+="device_num=N2C16 " +param+="global_batch_size=16 " +param+="nnodes=2 " +param+="autoconfig_json_file=autoconfig/llama7b_sft_N2C16.json " +param+="modle_json_file=autoconfig/llama7b_sft_params.json " + +cd ./tests +bash ./test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh multi + +bash -c "${param} bash ./test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh" diff --git a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh index 3bb53514be7f..2877c55661c7 100644 --- a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh +++ b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh @@ -23,9 +23,25 @@ tar -zxvf AdvertiseGen.tar.gz && rm -rf AdvertiseGen.tar.gz # mv autoconfig rm -rf autoconfig cp -r ../tests/test_tipc/auto_tuner/autoconfig ./ -unset PADDLE_ELASTIC_JOB_ID -unset PADDLE_TRAINER_ENDPOINTS -unset DISTRIBUTED_TRAINER_ENDPOINTS -unset FLAGS_START_PORT -unset PADDLE_ELASTIC_TIMEOUT -unset PADDLE_TRAINERS_NUM + +if [ -z "$1" ]; then + echo "单机任务" +else + echo "多机任务, 启动etcd服务" + pip install httpx etcd3 protobuf==3.20.0 --force-reinstall + ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' ')) + master_ip=${ip_lists[0]} + rank=$PADDLE_TRAINER_ID + echo $master_ip $rank + if [ $rank == 0 ]; then + net=$(netstat -anp | grep 2379 | grep "LISTEN") + if [ ${#net} == 0 ]; then + apt-get install -y --allow-downgrades etcd + nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 & + ps -ef |grep etcd + fi + else + sleep 5 + fi + sleep 5 +fi diff --git a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh index bed243d2c022..785adab372df 100644 --- a/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh +++ 
+++ b/tests/test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh
@@ -35,6 +35,9 @@ function _set_params(){
     fp_item="bf16"
     workerlog_id=0
 
+    ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' '))
+    master_ip=${ip_lists[0]}
+    nnodes=${nnodes:-1}
     # Common execution commands below; no changes needed unless special handling is required
     model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (required) do not change the format; keep it aligned with the competitor model name
     device=${CUDA_VISIBLE_DEVICES//,/ }
@@ -74,24 +77,23 @@ function _train(){
         log_file=${train_log_file}
     fi
 
-    if [ ${PADDLE_TRAINER_ID} ]
-    then
-        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
-    else
-        PADDLE_RANK_OPTION=""
-    fi
     # Common execution commands below; no changes needed unless special handling is required
     case ${device_num} in
     N1C1) echo "Run with: device_num=${device_num} run_mode=${run_mode}"
-        train_cmd="python -m paddle.distributed.launch --gpus=0 ${PADDLE_RANK_OPTION}\
+        train_cmd="python -m paddle.distributed.launch --gpus=0 \
            --auto_tuner_json ${autoconfig_json_file} finetune_generation.py ${modle_json_file}"
        ;;
-    N1C8|N2C16) echo "Run with: device_num=${device_num} run_mode=${run_mode}"
-        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
+    N1C8) echo "Run with: device_num=${device_num} run_mode=${run_mode}"
+        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
            --auto_tuner_json ${autoconfig_json_file} finetune_generation.py ${modle_json_file}"
        ;;
+    N2C16) echo "Run with: device_num=${device_num} run_mode=${run_mode}"
+        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
+            --auto_tuner_json ${autoconfig_json_file} --master etcd://$master_ip:2379 --nnodes $nnodes:$nnodes \
+            finetune_generation.py ${modle_json_file}"
+        ;;
     *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
-        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
+        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
            --auto_tuner_json ${autoconfig_json_file} finetune_generation.py ${modle_json_file}"
        ;;
     esac
@@ -123,6 +125,13 @@ function _train(){
 }
 
 export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
+unset PADDLE_ELASTIC_JOB_ID
+unset PADDLE_TRAINER_ENDPOINTS
+unset DISTRIBUTED_TRAINER_ENDPOINTS
+unset FLAGS_START_PORT
+unset PADDLE_ELASTIC_TIMEOUT
+unset PADDLE_TRAINERS_NUM
+unset PADDLE_TRAINER_ID
 source ${BENCHMARK_ROOT}/scripts/run_model.sh  # this script parses performance data from benchmark-compliant logs with analysis.py; comment this line out to produce only the training log without parsing, but re-enable it before submitting
 _set_params $@
 #_train  # uncomment to produce the training log only, without parsing
diff --git a/tests/test_tipc/auto_tuner/llama_pretrain/N2C16/CE_autotuner_llama7b_bs16_bf16_pretrain.sh b/tests/test_tipc/auto_tuner/llama_pretrain/N2C16/CE_autotuner_llama7b_bs16_bf16_pretrain.sh
new file mode 100644
index 000000000000..8f374d1c5e93
--- /dev/null
+++ b/tests/test_tipc/auto_tuner/llama_pretrain/N2C16/CE_autotuner_llama7b_bs16_bf16_pretrain.sh
@@ -0,0 +1,26 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +param="model_item=CE_autotuner_llama7b " +param+="run_mode=pretrain " +param+="device_num=N2C16 " +param+="global_batch_size=16 " +param+="nnodes=2 " +param+="autoconfig_json_file=autoconfig/llama7b_pretrain_N2C16.json " +param+="modle_json_file=autoconfig/llama7b_pretrain_params.json " + +cd ./tests +bash ./test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh multi + +bash -c "${param} bash ./test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh" diff --git a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh index 7fe08f4d2e34..24f852b4cb43 100644 --- a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh +++ b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh @@ -32,9 +32,25 @@ mv llama_openwebtext_100k_idx.npz ./data # mv autoconfig rm -rf autoconfig cp -r ../../tests/test_tipc/auto_tuner/autoconfig ./ -unset PADDLE_ELASTIC_JOB_ID -unset PADDLE_TRAINER_ENDPOINTS -unset DISTRIBUTED_TRAINER_ENDPOINTS -unset FLAGS_START_PORT -unset PADDLE_ELASTIC_TIMEOUT -unset PADDLE_TRAINERS_NUM + +if [ -z "$1" ]; then + echo "单机任务" +else + echo "多机任务, 启动etcd服务" + pip install httpx etcd3 protobuf==3.20.0 --force-reinstall + ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' ')) + master_ip=${ip_lists[0]} + rank=$PADDLE_TRAINER_ID + echo $master_ip $rank + if [ $rank == 0 ]; then + net=$(netstat -anp | grep 2379 | grep "LISTEN") + if [ ${#net} == 0 ]; then + apt-get install -y --allow-downgrades etcd + nohup etcd -data-dir ~/data.etcd -advertise-client-urls http://0.0.0.0:2379 -listen-client-urls http://0.0.0.0:2379 & + ps -ef |grep etcd + fi + else + sleep 5 + fi + sleep 5 +fi diff --git a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh index 0a82e9bd5464..8055fc75932d 100644 --- a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/run_benchmark.sh @@ -35,6 +35,9 @@ function _set_params(){ fp_item="bf16" workerlog_id=0 + ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' ')) + master_ip=${ip_lists[0]} + nnodes=${nnodes:-1} # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } @@ -74,24 +77,23 @@ function _train(){ log_file=${train_log_file} fi - if [ ${PADDLE_TRAINER_ID} ] - then - PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" - else - PADDLE_RANK_OPTION="" - fi # 以下为通用执行命令,无特殊可不用修改 case ${device_num} in N1C1) echo "Run with: device_num=${device_num} run_mode=${run_mode}" - train_cmd="python -m paddle.distributed.launch --gpus=0 ${PADDLE_RANK_OPTION}\ + train_cmd="python -m paddle.distributed.launch --gpus=0 \ --auto_tuner_json ${autoconfig_json_file} run_pretrain.py ${modle_json_file}" ;; - N1C8|N2C16) echo "Run with: device_num=${device_num}, run_mode=${run_mode}" - train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ + N1C8) echo "Run with: device_num=${device_num}, run_mode=${run_mode}" + train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ --auto_tuner_json ${autoconfig_json_file} run_pretrain.py ${modle_json_file}" ;; + N2C16) echo "Run with: device_num=${device_num} run_mode=${run_mode}" + train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \ + --auto_tuner_json 
+            --auto_tuner_json ${autoconfig_json_file} --master etcd://$master_ip:2379 --nnodes $nnodes:$nnodes \
+            run_pretrain.py ${modle_json_file}"
+        ;;
     *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
-        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
+        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
            --auto_tuner_json ${autoconfig_json_file} run_pretrain.py ${modle_json_file}"
        ;;
     esac
@@ -140,6 +142,13 @@ function _train(){
 }
 
 export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
+unset PADDLE_ELASTIC_JOB_ID
+unset PADDLE_TRAINER_ENDPOINTS
+unset DISTRIBUTED_TRAINER_ENDPOINTS
+unset FLAGS_START_PORT
+unset PADDLE_ELASTIC_TIMEOUT
+unset PADDLE_TRAINERS_NUM
+unset PADDLE_TRAINER_ID
 source ${BENCHMARK_ROOT}/scripts/run_model.sh  # this script parses performance data from benchmark-compliant logs with analysis.py; comment this line out to produce only the training log without parsing, but re-enable it before submitting
 _set_params $@
 #_train  # uncomment to produce the training log only, without parsing
diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh
index 78a5e832c553..6e4862bf3c43 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama/benchmark_common/run_benchmark.sh
@@ -143,6 +143,7 @@ function _train(){
                     --tensor_parallel_config ${tensor_parallel_config} ${pipeline_parallel_config_args} \
                     --recompute ${recompute} \
                     --recompute_use_reentrant ${recompute_use_reentrant} \
+                    --skip_memory_metrics 0 \
                     --data_cache ./data_cache"
 
     if [ ${PADDLE_TRAINER_ID} ]