FlagOpen · upvenly · May 25, 2023 · May 17, 2023 · May 19, 2023 · May 24, 2023
diff --git a/training/run_benchmarks/README_dev.md b/training/run_benchmarks/README_dev.md
@@ -0,0 +1,64 @@
+# dev.py功能说明
+
+### 总体概述
+
+1. 不考虑标准输出的内容，dev.py在功能上与run.py完全一致。即：运行dev.py，也能够执行testconfig中的对应case
+2. dev.py仅在标准输出上与run.py有区别。dev.py屏蔽了host所有的标准输出（不包括容器内的，因此不影响result中的log），改为输出以下信息：
+
+```shell
+Command 1, run at host
+	xxx
+Command 2, run at host
+    xxx
+Command 3: run at host
+[INFO] Command 3 let you go into docker(container)
+    xxx
+Command 4, run at docker(container)
+[INFO] If you set nnodes != 1, you should run command 1-3 on each hosts, then run the corresponding command 4 respectively
+    Command 4 at host yyy:
+        xxx
+    Command 4 at host zzz:
+        xxx
+```
+
+### 设计思路
+
+* 区分在容器外和容器内的执行命令，提供容器内执行命令，方便开发者直接在容器中调试。
+* 在“case尚未添加完毕”的时候，方便开发者跳过flagperf启动、加载、检查等环节，直接执行自己case对应的task
+
+### 简单使用步骤
+
+##### 验证已有case
+
+0. 检查case路径设置，如CASES = {"cpm:pytorch:A100:1:8:1": "/home/datasets_ckpt/cpm/train/"}。当字典中有多个k-v时，行为与执行run.py一致：会按照python解释器的字典key值遍历顺序遍历所有case，依次输出对应case的四条命令并执行case（建议使用dev.py功能时，在CASES中只放置一对k-v）
+
+1. 已有case的文件结构已经设置正确。因此在运行这一case时，直接将run.py替换为dev.py，即可获取dev.py的专有输出，得到相应命令
+2. 可以在相应主机上，运行command1-4，手动完成“run.py”的大部分流程
+3. 在command4运行完后，可继续执行command4，再次运行这一case，不需要关闭现有容器、启动新容器、配置新容器等步骤
+
+##### 添加新case过程中
+
+* 本文件的主要功能是，在case尚未正确添加的过程中，帮助开发者跳过相关流程
+
+1. 在test_config中写好即将添加的case
+2. 在nvidia/下面添加“{model}-{framework}/”目录，在{model}-{framework}/目录下添加config/与extern/目录，在config/目录下添加config_{hardware}x{nnode}x{nproc}.py
+3. 在benchmarks/下面添加{model}/目录，在{model}/目录下添加{framework}/目录，在{framework}/目录下添加run_pretraining.py
+4. 此时即可执行dev.py，获取在flagperf框架中，启动当前配置的case所需的命令，并进行调试
+5. 后续开发可仅在benchmarks/{model}/{framework}/下面进行，无需再关注flagperf框架
+
+* 下面以添加faster_rcnn模型pytorch框架标准case（nvidia A100 1\*8）作为样例。添加前首先将数据集、backbone权重等文件存放在了/home/xxx/目录下
+
+	1. 在test_config中，将CASES写为{'faster_rcnn:pytorch:A100:1:8:1':'/home/xxx'}
+
+ 	2. 在nvidia/下面添加faster_rcnn-pytorch/，在faster_rcnn-pytorch/目录下添加config/与extern/目录，在config/目录下添加config_A100x1x8.py
+ 	3. 在benchmarks/下面添加faster_rcnn/目录，在faster_rcnn/目录下添加pytorch/目录，在pytorch/目录下添加run_pretraining.py
+ 	4. 运行dev.py 获取了包含4条命令的输出
+
+此时，已经完成了dev.py的功能
+
+5. 输入命令1-3
+6. 输入命令4。由于此时run_pretraining.py是空的，命令4不会有任何效果
+7. 在benchmarks/faster_rcnn/pytorch/下面添加各种trainer/，dataloader/，model/等目录，按照文档标准填写run_pretraining.py，完成case编写
+8. 在步骤7的过程中，可反复使用命令4进行调试
+9. 调试完毕，退出容器，编写nvidia/faster_rcnn-pytorch/下面各文件
+10. 进行完整验证，填写case readme，提交PR
diff --git a/training/run_benchmarks/config/cluster_conf.py b/training/run_benchmarks/config/cluster_conf.py
@@ -6,5 +6,8 @@
 # Hosts port to run the tensorflow distribution_strategy = 'multi_worker_mirrored'
 HOSTS_PORTS = ["2222"]
 
+# Master port, passed to start_<framework>_task.py as --master_port
+MASTER_PORT = "29501"
+
 # ssh connection port
 SSH_PORT = "22"
diff --git a/training/run_benchmarks/dev.py b/training/run_benchmarks/dev.py
@@ -305,17 +305,27 @@ def stop_monitors_in_cluster(dp_path, nnodes):
 
 
 def start_tasks_in_cluster(dp_path, container_name, case_config, base_args,
-                           count, stdout, nullout):
+                           count, stdout, nullout, curr_log_path):
     '''Start tasks in cluster, and NOT wait.'''
     framework = case_config["framework"]
     nnodes = case_config["nnodes"]
     env_file = os.path.join(
         tc.FLAGPERF_PATH, tc.VENDOR,
         case_config["model"] + "-" + case_config["framework"],
         "config/environment_variables.sh")
-    start_cmd = "cd " + dp_path + " && " + sys.executable \
+    if (os.path.isfile(env_file)):
+        start_cmd = "cd " + dp_path + " && " + sys.executable \
+                + " utils/container_manager.py -o runcmdin -c " \
+                + container_name + " -d -r \"source " + env_file \
+                + " > " + curr_log_path + "/source_env.log.txt " \
+                + "2>&1 && " \
+                + "python3 " + tc.FLAGPERF_PATH + "/run_benchmarks/" \
+                + framework + "/start_" + framework + "_task.py " \
+                + base_args + " --round " + str(count)
+    else:
+        start_cmd = "cd " + dp_path + " && " + sys.executable \
                 + " utils/container_manager.py -o runcmdin -c " \
-                + container_name + " -d -r \"source " + env_file + "; " \
+                + container_name + " -d -r \"" \
                 + "python3 " + tc.FLAGPERF_PATH + "/run_benchmarks/" \
                 + framework + "/start_" + framework + "_task.py " \
                 + base_args + " --round " + str(count)
@@ -626,7 +636,8 @@ def main(stdout, nullout):
                     + " --log_dir " + log_dir_container \
                     + " --log_level " + tc.FLAGPERF_LOG_LEVEL \
                     + " --extern_config_file " + case_config["config"] \
-                    + ".py" + " --enable_extern_config "
+                    + ".py" + " --enable_extern_config " \
+                    + " --master_port " + cc.MASTER_PORT
         RUN_LOGGER.info("=== 2.2 Prepare case config in cluster. ===")
         if not prepare_case_config_cluster(dp_path, case_config, case):
             RUN_LOGGER.warning("Prepare case config in cluster...[FAILED]. " +
@@ -648,7 +659,8 @@ def main(stdout, nullout):
                 continue
             RUN_LOGGER.info("2) Start tasks in the cluster...")
             start_tasks_in_cluster(dp_path, container_name, case_config,
-                                   base_args, count, stdout, nullout)
+                                   base_args, count, stdout, nullout,
+                                   curr_log_path)
 
             # Wait until start_xxx_task.py finished.
             RUN_LOGGER.info("3) Waiting for tasks end in the cluster...")

diff --git a/training/run_benchmarks/run.py b/training/run_benchmarks/run.py
@@ -284,17 +284,27 @@ def stop_monitors_in_cluster(dp_path, nnodes):
 
 
 def start_tasks_in_cluster(dp_path, container_name, case_config, base_args,
-                           count):
+                           count, curr_log_path):
     '''Start tasks in cluster, and NOT wait.'''
     framework = case_config["framework"]
     nnodes = case_config["nnodes"]
     env_file = os.path.join(
         tc.FLAGPERF_PATH, tc.VENDOR,
         case_config["model"] + "-" + case_config["framework"],
         "config/environment_variables.sh")
-    start_cmd = "cd " + dp_path + " && " + sys.executable \
+    if (os.path.isfile(env_file)):
+        start_cmd = "cd " + dp_path + " && " + sys.executable \
+                + " utils/container_manager.py -o runcmdin -c " \
+                + container_name + " -d -r \"source " + env_file \
+                + " > " + curr_log_path + "/source_env.log.txt " \
+                + "2>&1 && " \
+                + "python3 " + tc.FLAGPERF_PATH + "/run_benchmarks/" \
+                + framework + "/start_" + framework + "_task.py " \
+                + base_args + " --round " + str(count)
+    else:
+        start_cmd = "cd " + dp_path + " && " + sys.executable \
                 + " utils/container_manager.py -o runcmdin -c " \
-                + container_name + " -d -r \"source " + env_file + "; " \
+                + container_name + " -d -r \"" \
                 + "python3 " + tc.FLAGPERF_PATH + "/run_benchmarks/" \
                 + framework + "/start_" + framework + "_task.py " \
                 + base_args + " --round " + str(count)
@@ -582,7 +592,8 @@ def main():
                     + " --log_dir " + log_dir_container \
                     + " --log_level " + tc.FLAGPERF_LOG_LEVEL \
                     + " --extern_config_file " + case_config["config"] \
-                    + ".py" + " --enable_extern_config "
+                    + ".py" + " --enable_extern_config " \
+                    + " --master_port " + cc.MASTER_PORT
         RUN_LOGGER.info("=== 2.2 Prepare case config in cluster. ===")
         if not prepare_case_config_cluster(dp_path, case_config, case):
             RUN_LOGGER.warning("Prepare case config in cluster...[FAILED]. " +
@@ -604,7 +615,7 @@ def main():
                 continue
             RUN_LOGGER.info("2) Start tasks in the cluster...")
             start_tasks_in_cluster(dp_path, container_name, case_config,
-                                   base_args, count)
+                                   base_args, count, curr_log_path)
 
             # Wait until start_xxx_task.py finished.
             RUN_LOGGER.info("3) Waiting for tasks end in the cluster...")