Commit
* add N2C16
* fix
* update N2C16_gbs to 16
* fix env
* fix
* fix
* update
* fix env set
* fix
* fix bug
* update
1 parent e8d6233 · commit 28b9eb8
Showing 13 changed files with 428 additions and 34 deletions.
tests/test_tipc/auto_tuner/autoconfig/llama7b_lora_N2C16.json (new file, 85 additions, 0 deletions)
{
  "dp_degree": "auto",
  "invalid_strategy": [
    "stage3_mp*"
  ],
  "max_search_time": 900,
  "max_time_per_task": 300,
  "metric_cfg": {
    "OptimizationDirection": "Maximize",
    "name": "interval_samples_per_second"
  },
  "micro_batch_size": "auto",
  "mode": "LoRA",
  "model_cfg": {
    "global_batch_size": 16,
    "hidden_size": 4096,
    "num_attention_heads": 32,
    "num_layers": 28,
    "vocab_size": 65024
  },
  "mp_degree": [
    1
  ],
  "need_baseline": true,
  "pp_degree": [
    1
  ],
  "run_cmd": {
    "gradient_accumulation_steps": [
      "./autoconfig/llama7b_lora_params.json",
      "gradient_accumulation_steps"
    ],
    "micro_batch_size": [
      "./autoconfig/llama7b_lora_params.json",
      "per_device_train_batch_size"
    ],
    "mp_degree": [
      "./autoconfig/llama7b_lora_params.json",
      "tensor_parallel_degree"
    ],
    "pp_degree": [
      "./autoconfig/llama7b_lora_params.json",
      "pipeline_parallel_degree"
    ],
    "run_best_stage": {
      "autotuner_benchmark": [
        "./autoconfig/llama7b_lora_params.json",
        "autotuner_benchmark",
        0
      ]
    },
    "search_stage": {
      "autotuner_benchmark": [
        "./autoconfig/llama7b_lora_params.json",
        "autotuner_benchmark",
        1
      ]
    },
    "sharding_degree": [
      "./autoconfig/llama7b_lora_params.json",
      "sharding_parallel_degree"
    ],
    "sharding_stage": [
      "./autoconfig/llama7b_lora_params.json",
      "sharding",
      "stage"
    ],
    "use_recompute": [
      "./autoconfig/llama7b_lora_params.json",
      "recompute"
    ],
    "recompute_granularity": [
      "./autoconfig/llama7b_lora_params.json",
      "recompute_granularity"
    ]
  },
  "schedule_prior": [
    "mp4"
  ],
  "sharding_degree": "auto",
  "sharding_stage": "auto",
  "task_limit": 2000,
  "use_recompute": "auto",
  "recompute_granularity": "auto"
}
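Each `run_cmd` entry names the params file the tuner rewrites and the launch argument a tuner knob maps onto; entries under `search_stage` and `run_best_stage` additionally pin a value (1 during the search, 0 for the final best-config run). As a rough sketch of what the `micro_batch_size` mapping amounts to when the tuner tries a candidate value, with `jq` standing in for the tuner's internal rewrite:

```bash
# Illustration only: the tuner rewrites the mapped key in the params file
# for each trial; jq stands in for that internal step, and 4 is an
# arbitrary candidate micro batch size, not a value from this commit.
jq '.per_device_train_batch_size = 4' \
  ./autoconfig/llama7b_lora_params.json > /tmp/trial_params.json
```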
tests/test_tipc/auto_tuner/autoconfig/llama7b_pretrain_N2C16.json (new file, 87 additions, 0 deletions)
{
  "dp_degree": "auto",
  "max_search_time": 900,
  "max_time_per_task": 400,
  "metric_cfg": {
    "OptimizationDirection": "Maximize",
    "name": "interval_samples_per_second"
  },
  "micro_batch_size": "auto",
  "model_cfg": {
    "global_batch_size": 16,
    "hidden_size": 5120,
    "num_attention_heads": 40,
    "num_layers": 40,
    "vocab_size": 32000
  },
  "mp_degree": "auto",
  "pp_degree": "auto",
  "run_cmd": {
    "gradient_accumulation_steps": [
      "./autoconfig/llama7b_pretrain_params.json",
      "gradient_accumulation_steps"
    ],
    "micro_batch_size": [
      "./autoconfig/llama7b_pretrain_params.json",
      "per_device_train_batch_size"
    ],
    "mp_degree": [
      "./autoconfig/llama7b_pretrain_params.json",
      "tensor_parallel_degree"
    ],
    "pp_degree": [
      "./autoconfig/llama7b_pretrain_params.json",
      "pipeline_parallel_degree"
    ],
    "run_best_stage": {
      "continue_training": [
        "./autoconfig/llama7b_pretrain_params.json",
        "continue_training",
        0
      ],
      "autotuner_benchmark": [
        "./autoconfig/llama7b_pretrain_params.json",
        "autotuner_benchmark",
        0
      ]
    },
    "search_stage": {
      "continue_training": [
        "./autoconfig/llama7b_pretrain_params.json",
        "continue_training",
        0
      ],
      "autotuner_benchmark": [
        "./autoconfig/llama7b_pretrain_params.json",
        "autotuner_benchmark",
        1
      ]
    },
    "sharding_degree": [
      "./autoconfig/llama7b_pretrain_params.json",
      "sharding_parallel_degree"
    ],
    "sharding_stage": [
      "./autoconfig/llama7b_pretrain_params.json",
      "sharding",
      "stage"
    ],
    "use_recompute": [
      "./autoconfig/llama7b_pretrain_params.json",
      "recompute"
    ],
    "recompute_granularity": [
      "./autoconfig/llama7b_pretrain_params.json",
      "recompute_granularity"
    ]
  },
  "sharding_degree": "auto",
  "sharding_stage": "auto",
  "task_limit": 2000,
  "use_recompute": "auto",
  "recompute_granularity": "auto",
  "invalid_strategy": ["stage3_mp*"],
  "schedule_prior": ["mp4"],
  "need_baseline": true,
  "mode": "Pretrain"
}
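The `search_stage` and `run_best_stage` blocks differ only in the pinned values: candidate runs set `autotuner_benchmark` to 1 and `continue_training` to 0, while the final best-config run flips `autotuner_benchmark` back to 0. A minimal sketch of the effective per-stage overrides, again with `jq` standing in for the tuner:

```bash
# Sketch of the per-stage overrides applied to the shared params file
# (jq here is illustrative; the auto tuner applies these internally).
jq '.autotuner_benchmark = 1 | .continue_training = 0' \
  ./autoconfig/llama7b_pretrain_params.json > /tmp/search_stage_params.json
jq '.autotuner_benchmark = 0 | .continue_training = 0' \
  ./autoconfig/llama7b_pretrain_params.json > /tmp/run_best_params.json
```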
tests/test_tipc/auto_tuner/autoconfig/llama7b_sft_N2C16.json (new file, 83 additions, 0 deletions)
{
  "dp_degree": "auto",
  "invalid_strategy": [
    "stage3_mp*"
  ],
  "max_search_time": 900,
  "max_time_per_task": 300,
  "metric_cfg": {
    "OptimizationDirection": "Maximize",
    "name": "interval_samples_per_second"
  },
  "micro_batch_size": "auto",
  "mode": "SFT",
  "model_cfg": {
    "global_batch_size": 16,
    "hidden_size": 4096,
    "num_attention_heads": 32,
    "num_layers": 28,
    "vocab_size": 65024
  },
  "mp_degree": "auto",
  "need_baseline": true,
  "pp_degree": [
    1
  ],
  "run_cmd": {
    "gradient_accumulation_steps": [
      "./autoconfig/llama7b_sft_params.json",
      "gradient_accumulation_steps"
    ],
    "micro_batch_size": [
      "./autoconfig/llama7b_sft_params.json",
      "per_device_train_batch_size"
    ],
    "mp_degree": [
      "./autoconfig/llama7b_sft_params.json",
      "tensor_parallel_degree"
    ],
    "pp_degree": [
      "./autoconfig/llama7b_sft_params.json",
      "pipeline_parallel_degree"
    ],
    "run_best_stage": {
      "autotuner_benchmark": [
        "./autoconfig/llama7b_sft_params.json",
        "autotuner_benchmark",
        0
      ]
    },
    "search_stage": {
      "autotuner_benchmark": [
        "./autoconfig/llama7b_sft_params.json",
        "autotuner_benchmark",
        1
      ]
    },
    "sharding_degree": [
      "./autoconfig/llama7b_sft_params.json",
      "sharding_parallel_degree"
    ],
    "sharding_stage": [
      "./autoconfig/llama7b_sft_params.json",
      "sharding",
      "stage"
    ],
    "use_recompute": [
      "./autoconfig/llama7b_sft_params.json",
      "recompute"
    ],
    "recompute_granularity": [
      "./autoconfig/llama7b_sft_params.json",
      "recompute_granularity"
    ]
  },
  "schedule_prior": [
    "mp4"
  ],
  "sharding_degree": "auto",
  "sharding_stage": "auto",
  "task_limit": 2000,
  "use_recompute": "auto",
  "recompute_granularity": "auto"
}
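Because these three configs are near-copies of one another, it is worth confirming that every `run_cmd` entry in a given mode's config points at that mode's own params file. A quick, illustrative check with `jq`:

```bash
# Illustrative sanity check: list every params file referenced in run_cmd.
# For the SFT config the output should be a single path,
# ./autoconfig/llama7b_sft_params.json.
jq -r '.run_cmd | .. | strings | select(endswith(".json"))' \
  tests/test_tipc/auto_tuner/autoconfig/llama7b_sft_N2C16.json | sort -u
```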
tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_lora.sh (new file, 26 additions, 0 deletions)
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

param="model_item=CE_autotuner_llama7b "
param+="run_mode=lora "
param+="device_num=N2C16 "
param+="global_batch_size=16 "
param+="nnodes=2 "
param+="autoconfig_json_file=autoconfig/llama7b_lora_N2C16.json "
param+="modle_json_file=autoconfig/llama7b_lora_params.json "

cd ./tests
bash ./test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh multi

bash -c "${param} bash ./test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh"
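The final line relies on a plain shell mechanism rather than anything PaddlePaddle-specific: in `bash -c "a=1 b=2 cmd"`, the leading `key=value` pairs become environment variables for `cmd`, so `run_benchmark.sh` can read each setting directly. A minimal demonstration of the same pattern:

```bash
# The key=value prefixes in ${param} arrive as environment variables
# inside the inner command:
param="run_mode=lora device_num=N2C16 "
bash -c "${param} printenv run_mode device_num"
# prints:
#   lora
#   N2C16
```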
tests/test_tipc/auto_tuner/llama_finetune/N2C16/CE_autotuner_llama7b_bs16_bf16_sft.sh (new file, 26 additions, 0 deletions)
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

param="model_item=CE_autotuner_llama7b "
param+="run_mode=sft "
param+="device_num=N2C16 "
param+="global_batch_size=16 "
param+="nnodes=2 "
param+="autoconfig_json_file=autoconfig/llama7b_sft_N2C16.json "
param+="modle_json_file=autoconfig/llama7b_sft_params.json "

cd ./tests
bash ./test_tipc/auto_tuner/llama_finetune/benchmark_common/prepare.sh multi

bash -c "${param} bash ./test_tipc/auto_tuner/llama_finetune/benchmark_common/run_benchmark.sh"
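Downstream, `run_benchmark.sh` is expected to turn these variables into a multi-node auto-tuner launch; the exact command lives in that script. The sketch below only illustrates the shape of such a launch: the `--auto_tuner_json` flag is how the Paddle distributed launcher consumes configs like the ones above, while the master address and the training entry point are placeholders, not taken from this commit.

```bash
# Illustrative shape of the eventual launch; the master address and the
# entry-point script name (run_finetune.py) are placeholders.
python -m paddle.distributed.launch \
  --nnodes "${nnodes}" \
  --master "10.0.0.1:8090" \
  --auto_tuner_json "${autoconfig_json_file}" \
  run_finetune.py "${modle_json_file}"
```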