Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support different image #93

Merged
merged 11 commits into the base branch from the feature branch
Jun 5, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ __Tips:__

* 请根据自己所在地区,选用合适的pip源来配置PIP_SOURCE
* 每次运行可配置多个benchmark case,每个benchmark case可以通过repeat来配置运行次数
* FlagPerf使用CASES变量中的键(key)来索引相应模型(model,如bert),框架(framework,如pytorch),硬件类型(hardware_model,如A100),主机数量(nnodes,如1),计算卡数量(nproc,如8),和重复测试次数(repeat,如1),以冒号:为分隔符,按照“model:framework:hardware_model:nnodes:nproc:repeat”的格式以字符串存储。键对应的值为运行这一样例对应数据/模型权重所在目录
* FlagPerf使用CASES变量中的键(key)来索引相应模型(model,如bert),框架(framework,可选pytorch、pytorch_1.13),硬件类型(hardware_model,如A100),主机数量(nnodes,如1),计算卡数量(nproc,如8),和重复测试次数(repeat,如1),以冒号:为分隔符,按照“model:framework:hardware_model:nnodes:nproc:repeat”的格式以字符串存储。键对应的值为运行这一样例对应数据/模型权重所在目录
* 例如,用户在目录/abc/def/data/存放了模型bert在框架pytorch下面运行的数据集与预训练权重,希望在2机8卡A100(共16卡)的环境上测试这一任务,重复3次取平均值,则需要在CASES中增加"bert:pytorch:A100:2:8:3":"/abc/def/data/"这一键值对。key中的bert为模型,pytorch为框架,A100为硬件类型,2为主机数量,8为每个主机上面的计算卡数量,3为重复次数,"abc/def/data/"为数据和权重的存放路径

```
Expand Down
1 change: 0 additions & 1 deletion training/benchmarks/driver/config_manager.py
upvenly marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ def activate(base_config,
base_config.override(parsed_params.__dict__, False)
else:
_merge_dict_to_config(parsed_params, base_config.__dict__)

if ext_config:
config_path = ext_config
else:
Expand Down
2 changes: 1 addition & 1 deletion training/benchmarks/driver/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ class InitHelper:

def __init__(self, config: object) -> None:
self.config = config
self.update_local_rank()

def init_driver(self, global_module, local_module) -> Driver:
"""
Expand All @@ -28,6 +27,7 @@ def init_driver(self, global_module, local_module) -> Driver:
model_driver.setup_config(argparse.ArgumentParser(config.name))
model_driver.setup_modules(global_module, local_module)
check.check_config(model_driver.config)
self.update_local_rank()
return model_driver

def get_logger(self) -> perf_logger.PerfLogger:
Expand Down
4 changes: 4 additions & 0 deletions training/nvidia/docker_image/pytorch_1.13/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Base image: NVIDIA NGC PyTorch container, 22.11 release (ships PyTorch 1.13).
FROM nvcr.io/nvidia/pytorch:22.11-py3
# Point pip at a regional mirror for faster, more reliable installs.
RUN /bin/bash -c "pip config set global.index-url https://mirror.baidu.com/pypi/simple"
# Print kernel/architecture info into the build log for debugging.
RUN /bin/bash -c "uname -a"
# FIX: the original `RUN /bin/bash -c alias python3=python` was a no-op —
# without quotes, bash -c received only the word "alias" (with
# "python3=python" bound to $0), and a shell alias would not survive past
# this RUN layer in any case. A symlink makes `python3` resolve to the
# container's `python` persistently for all later layers and at runtime.
RUN ln -sf "$(which python)" /usr/local/bin/python3
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#!/bin/bash
8 changes: 8 additions & 0 deletions training/nvidia/wav2vec2-pytorch/config/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Pinned dependencies for the NVIDIA wav2vec2 (PyTorch) benchmark case.
editdistance==0.6.0
librosa==0.8.0
omegaconf==2.0.6 # optional for handling certain Fairseq ckpts
pyarrow==6.0.1
soundfile==0.10.3.post1
sox==1.4.1
tqdm==4.53.0
# NVIDIA DLLogger is not on PyPI; install directly from the tagged GitHub release.
git+https://github.com/NVIDIA/dllogger@v1.0.0#egg=dllogger
25 changes: 8 additions & 17 deletions training/run_benchmarks/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,16 +100,17 @@ def check_case_config(case, case_config, vendor):
",".join(must_configs))
return False

framework = case_config["framework"].split("_")[0]
model_path = CURR_PATH + "/../benchmarks/" + case_config["model"] + \
"/" + case_config["framework"]
"/" + framework
model_path = os.path.abspath(model_path)
if not os.path.exists(model_path):
RUN_LOGGER.warning("Case " + case + ": deploy path doesn't exist: " +
model_path)
return False

config_path = CURR_PATH + "/../" + vendor + "/" + case_config["model"] + \
"-" + case_config["framework"] + "/config/" + \
"-" + framework + "/config/" + \
case_config["config"] + ".py"
if not os.path.isfile(config_path):
RUN_LOGGER.warning("Case " + case + ": config file doesn't exist: " +
Expand All @@ -136,7 +137,7 @@ def prepare_docker_image_cluster(dp_path, image_mgr, framework, nnodes):
CURR_PATH, "../" + vendor + "/docker_image/" + framework)
image_name = image_mgr.repository + ":" + image_mgr.tag
RUN_LOGGER.debug("Prepare docker image in cluster. image_name=" +
image_name + "image_vendor_dir=" + image_vendor_dir)
image_name + " image_vendor_dir=" + image_vendor_dir)
prepare_image_cmd = "cd " + dp_path + " && " + sys.executable \
+ " utils/image_manager.py -o build -i " \
+ image_mgr.repository + " -t " + image_mgr.tag \
Expand Down Expand Up @@ -286,23 +287,13 @@ def stop_monitors_in_cluster(dp_path, nnodes):
def start_tasks_in_cluster(dp_path, container_name, case_config, base_args,
count, curr_log_path):
'''Start tasks in cluster, and NOT wait.'''
framework = case_config["framework"]
nnodes = case_config["nnodes"]
env_file = os.path.join(
tc.FLAGPERF_PATH, tc.VENDOR,
case_config["model"] + "-" + case_config["framework"],
"config/environment_variables.sh")
if (os.path.isfile(env_file)):
start_cmd = "cd " + dp_path + " && " + sys.executable \
+ " utils/container_manager.py -o runcmdin -c " \
+ container_name + " -d -r \"source " + env_file \
+ " > " + curr_log_path + "/source_env.log.txt " \
+ "2>&1 && " \
+ "python3 " + tc.FLAGPERF_PATH + "/run_benchmarks/" \
+ framework + "/start_" + framework + "_task.py " \
+ base_args + " --round " + str(count)
else:
start_cmd = "cd " + dp_path + " && " + sys.executable \
framework = case_config["framework"].split("_")[0]
upvenly marked this conversation as resolved.
Show resolved Hide resolved
start_cmd = "cd " + dp_path + " && " + sys.executable \
+ " utils/container_manager.py -o runcmdin -c " \
+ container_name + " -d -r \"" \
+ "python3 " + tc.FLAGPERF_PATH + "/run_benchmarks/" \
Expand Down Expand Up @@ -498,7 +489,7 @@ def prepare_case_config_cluster(dp_path, case_config, case):
RUN_LOGGER.info(config_item + ":\t" + str(case_config[config_item]))
RUN_LOGGER.info("--------------------------------------------------")
model = case_config["model"]
framework = case_config["framework"]
framework = case_config["framework"].split("_")[0]
config_file = case_config["config"] + ".py"
nnodes = case_config["nnodes"]
case_config_dir = os.path.join(dp_path, tc.VENDOR, model + "-" + framework,
Expand Down Expand Up @@ -621,7 +612,7 @@ def main():
RUN_LOGGER.info("3) Waiting for tasks end in the cluster...")
pid_file_path = os.path.join(
log_dir_container,
"start_" + case_config["framework"] + "_task.pid")
"start_" + case_config["framework"].split("_")[0] + "_task.pid")
wait_for_finish(dp_path, container_name, pid_file_path, nnodes)
RUN_LOGGER.info("3) Training tasks end in the cluster...")
RUN_LOGGER.info("4) Clean container environments in cluster...")
Expand Down