Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support different image #93

Merged
merged 11 commits into the base branch from the feature branch
Jun 5, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ __Tips:__

* 请根据自己所在地区,选用合适的pip源来配置PIP_SOURCE
* 每次运行可配置多个benchmark case,每个benchmark case可以通过repeat来配置运行次数
* FlagPerf使用CASES变量中的键(key)来索引相应模型(model,如bert),框架(framework,如pytorch),硬件类型(hardware_model,如A100),主机数量(nnodes,如1),计算卡数量(nproc,如8),和重复测试次数(repeat,如1),以冒号:为分隔符,按照“model:framework:hardware_model:nnodes:nproc:repeat”的格式以字符串存储。键对应的值为运行这一样例对应数据/模型权重所在目录
* FlagPerf使用CASES变量中的键(key)来索引相应模型(model,如bert),框架(framework,可选pytorch、pytorch_1.13),硬件类型(hardware_model,如A100),主机数量(nnodes,如1),计算卡数量(nproc,如8),和重复测试次数(repeat,如1),以冒号:为分隔符,按照“model:framework:hardware_model:nnodes:nproc:repeat”的格式以字符串存储。键对应的值为运行这一样例对应数据/模型权重所在目录
* 例如,用户在目录/abc/def/data/存放了模型bert在框架pytorch下面运行的数据集与预训练权重,希望在2机8卡A100(共16卡)的环境上测试这一任务,重复3次取平均值,则需要在CASES中增加"bert:pytorch:A100:2:8:3":"/abc/def/data/"这一键值对。key中的bert为模型,pytorch为框架,A100为硬件类型,2为主机数量,8为每个主机上面的计算卡数量,3为重复次数,"abc/def/data/"为数据和权重的存放路径

```
Expand Down
1 change: 0 additions & 1 deletion training/benchmarks/driver/config_manager.py
upvenly marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ def activate(base_config,
base_config.override(parsed_params.__dict__, False)
else:
_merge_dict_to_config(parsed_params, base_config.__dict__)

if ext_config:
config_path = ext_config
else:
Expand Down
2 changes: 1 addition & 1 deletion training/benchmarks/driver/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ class InitHelper:

def __init__(self, config: object) -> None:
self.config = config
self.update_local_rank()

def init_driver(self, global_module, local_module) -> Driver:
"""
Expand All @@ -28,6 +27,7 @@ def init_driver(self, global_module, local_module) -> Driver:
model_driver.setup_config(argparse.ArgumentParser(config.name))
model_driver.setup_modules(global_module, local_module)
check.check_config(model_driver.config)
self.update_local_rank()
return model_driver

def get_logger(self) -> perf_logger.PerfLogger:
Expand Down
4 changes: 4 additions & 0 deletions training/nvidia/docker_image/pytorch_1.13/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Base image: NVIDIA NGC PyTorch container, 22.11 release (ships PyTorch 1.13).
FROM nvcr.io/nvidia/pytorch:22.11-py3
# Point pip at a regional mirror for faster, more reliable installs.
RUN /bin/bash -c "pip config set global.index-url https://mirror.baidu.com/pypi/simple"
# Print kernel/architecture info into the build log for debugging.
RUN /bin/bash -c "uname -a"
# FIX: the original `RUN /bin/bash -c alias python3=python` was a no-op —
# without quotes, bash -c received only the word "alias" (with
# "python3=python" bound to $0), and a shell alias would not survive past
# this RUN layer in any case. A symlink makes `python3` resolve to the
# container's `python` persistently for all later layers and at runtime.
RUN ln -sf "$(which python)" /usr/local/bin/python3
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#!/bin/bash
8 changes: 8 additions & 0 deletions training/nvidia/wav2vec2-pytorch/config/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Pinned dependencies for the NVIDIA wav2vec2 (PyTorch) benchmark case.
editdistance==0.6.0
librosa==0.8.0
omegaconf==2.0.6 # optional for handling certain Fairseq ckpts
pyarrow==6.0.1
soundfile==0.10.3.post1
sox==1.4.1
tqdm==4.53.0
# NVIDIA DLLogger is not on PyPI; install directly from the tagged GitHub release.
git+https://github.com/NVIDIA/dllogger@v1.0.0#egg=dllogger
25 changes: 8 additions & 17 deletions training/run_benchmarks/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,16 +100,17 @@ def check_case_config(case, case_config, vendor):
",".join(must_configs))
return False

framework = case_config["framework"].split("_")[0]
model_path = CURR_PATH + "/../benchmarks/" + case_config["model"] + \
"/" + case_config["framework"]
"/" + framework
model_path = os.path.abspath(model_path)
if not os.path.exists(model_path):
RUN_LOGGER.warning("Case " + case + ": deploy path doesn't exist: " +
model_path)
return False

config_path = CURR_PATH + "/../" + vendor + "/" + case_config["model"] + \
"-" + case_config["framework"] + "/config/" + \
"-" + framework + "/config/" + \
case_config["config"] + ".py"
if not os.path.isfile(config_path):
RUN_LOGGER.warning("Case " + case + ": config file doesn't exist: " +
Expand All @@ -136,7 +137,7 @@ def prepare_docker_image_cluster(dp_path, image_mgr, framework, nnodes):
CURR_PATH, "../" + vendor + "/docker_image/" + framework)
image_name = image_mgr.repository + ":" + image_mgr.tag
RUN_LOGGER.debug("Prepare docker image in cluster. image_name=" +
image_name + "image_vendor_dir=" + image_vendor_dir)
image_name + " image_vendor_dir=" + image_vendor_dir)
prepare_image_cmd = "cd " + dp_path + " && " + sys.executable \
+ " utils/image_manager.py -o build -i " \
+ image_mgr.repository + " -t " + image_mgr.tag \
Expand Down Expand Up @@ -286,23 +287,13 @@ def stop_monitors_in_cluster(dp_path, nnodes):
def start_tasks_in_cluster(dp_path, container_name, case_config, base_args,
count, curr_log_path):
'''Start tasks in cluster, and NOT wait.'''
framework = case_config["framework"]
nnodes = case_config["nnodes"]
env_file = os.path.join(
tc.FLAGPERF_PATH, tc.VENDOR,
case_config["model"] + "-" + case_config["framework"],
"config/environment_variables.sh")
if (os.path.isfile(env_file)):
start_cmd = "cd " + dp_path + " && " + sys.executable \
+ " utils/container_manager.py -o runcmdin -c " \
+ container_name + " -d -r \"source " + env_file \
+ " > " + curr_log_path + "/source_env.log.txt " \
+ "2>&1 && " \
+ "python3 " + tc.FLAGPERF_PATH + "/run_benchmarks/" \
+ framework + "/start_" + framework + "_task.py " \
+ base_args + " --round " + str(count)
else:
start_cmd = "cd " + dp_path + " && " + sys.executable \
framework = case_config["framework"].split("_")[0]
upvenly marked this conversation as resolved.
Show resolved Hide resolved
start_cmd = "cd " + dp_path + " && " + sys.executable \
+ " utils/container_manager.py -o runcmdin -c " \
+ container_name + " -d -r \"" \
+ "python3 " + tc.FLAGPERF_PATH + "/run_benchmarks/" \
Expand Down Expand Up @@ -498,7 +489,7 @@ def prepare_case_config_cluster(dp_path, case_config, case):
RUN_LOGGER.info(config_item + ":\t" + str(case_config[config_item]))
RUN_LOGGER.info("--------------------------------------------------")
model = case_config["model"]
framework = case_config["framework"]
framework = case_config["framework"].split("_")[0]
config_file = case_config["config"] + ".py"
nnodes = case_config["nnodes"]
case_config_dir = os.path.join(dp_path, tc.VENDOR, model + "-" + framework,
Expand Down Expand Up @@ -621,7 +612,7 @@ def main():
RUN_LOGGER.info("3) Waiting for tasks end in the cluster...")
pid_file_path = os.path.join(
log_dir_container,
"start_" + case_config["framework"] + "_task.pid")
"start_" + case_config["framework"].split("_")[0] + "_task.pid")
wait_for_finish(dp_path, container_name, pid_file_path, nnodes)
RUN_LOGGER.info("3) Training tasks end in the cluster...")
RUN_LOGGER.info("4) Clean container environments in cluster...")
Expand Down