forked from PaddlePaddle/PaddleRec
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request PaddlePaddle#797 from wangzhen38/gpups_1n1p
add gpups_1n1c benchmark
- Loading branch information
Showing
7 changed files
with
649 additions
and
37 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,250 @@ | ||
#!/bin/bash
# Benchmark training driver: rewrites a copy of a TIPC config for each
# batch size / precision / device combination, runs the training script and
# hands the resulting log to the log-analysis script.
source test_tipc/common_func.sh

# set env
python=python
# Keep every path component after "refs/heads/" so that branch names that
# contain "/" (e.g. feature/foo) are not truncated to their first component.
export model_branch=$(git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3-)
# First line of `git log` is "commit <sha>"; the second word is the sha.
export model_commit=$(git log | head -n1 | awk '{print $2}')
# Installed paddlepaddle-gpu version; the echo joins multiple matches on one line.
export str_tmp=$(echo $(pip list | grep paddlepaddle-gpu | awk -F ' ' '{print $2}'))
# Drop a ".post*" suffix from the version string.
export frame_version=${str_tmp%%.post*}
export frame_commit=$(echo $(${python} -c "import paddle;print(paddle.version.commit)"))
|
||
# run benchmark sh
# Usage:
# bash test_tipc/benchmark_train.sh config.txt benchmark_train params
# or
# bash test_tipc/benchmark_train.sh config.txt benchmark_train
|
||
#######################################
# Print the second "="-separated field of a string such as "key=value".
# Only the text between the first and the second "=" is returned, so
# "a=b=c" yields "b".
# Arguments: $1 - string to parse
# Outputs:   the field on stdout (an empty line when $1 contains no "=")
#######################################
function func_parser_params(){
  local strs=$1
  local rest=""
  # Parameter expansion instead of IFS word-splitting: the caller's IFS is
  # left untouched and values such as "*" are not glob-expanded.
  if [[ "${strs}" == *=* ]]; then
    rest=${strs#*=}
  fi
  printf '%s\n' "${rest%%=*}"
}
|
||
#######################################
# Replace the value part of one "key:value" line of a config file in place.
# Arguments:
#   $1 - path of the config file (edited with sed -i)
#   $2 - 1-based line number to rewrite
#   $3 - new value to store after "key:"
# Outputs:   none; the file is modified
# Returns:   exit status of the final sed invocation
#######################################
function func_sed_params(){
  local filename=$1
  local line=$2
  local param_value=$3
  local params key value rest new_params escaped

  params=$(sed -n "${line}p" "${filename}")
  # key = text before the first ":", value = text between the first and the
  # second ":" (same fields the former IFS=":" split produced, minus the
  # globbing and the IFS side effect on the caller).
  key=${params%%:*}
  rest=""
  if [[ "${params}" == *:* ]]; then
    rest=${params#*:}
  fi
  value=${rest%%:*}

  if [[ "${value}" == *benchmark_train* ]]; then
    # The line already carries a benchmark_train entry: prefix the new value
    # with the text that precedes the first "=" of the current value.
    # NOTE(review): callers in this script already pass "benchmark_train=<n>",
    # so a second rewrite of the same line yields
    # "benchmark_train=benchmark_train=<n>" - confirm this is intended.
    param_value="${value%%=*}=${param_value}"
  fi
  new_params="${key}:${param_value}"

  # Escape the characters that are special in a sed replacement (\ / &) so
  # values such as /usr/bin/python3 are written literally, and call sed
  # directly instead of eval-ing a hand-built command string.
  escaped=$(printf '%s' "${new_params}" | sed -e 's|[\\/&]|\\&|g')
  sed -i "${line}s/.*/${escaped}/" "${filename}"
}
|
||
#######################################
# Build the comma-separated GPU id list for a device spec such as "N1C4".
# The first character is skipped; the number before "C" (M) and the number
# after it (P) give the ids 0..(P-1)/M using integer division.
# Arguments: $1 - device spec, e.g. N1C1, N1C4, N2C16
# Outputs:   id list on stdout, e.g. "0,1,2,3"; diagnostics on stderr
# Returns:   0 on success, 1 when the spec is malformed or M is 0
#######################################
function set_gpu_id(){
  local string=$1
  local spec_re='^.([0-9]+)C([0-9]+)$'
  local node_num card_num last_gpu ids

  if [[ ! "${string}" =~ ${spec_re} ]]; then
    echo "set_gpu_id: invalid device spec '${string}' (expected e.g. N1C4)" >&2
    return 1
  fi
  # 10# forces base 10 so zero-padded numbers are not parsed as octal.
  node_num=$((10#${BASH_REMATCH[1]}))
  card_num=$((10#${BASH_REMATCH[2]}))
  if (( node_num == 0 )); then
    echo "set_gpu_id: first number must be non-zero in '${string}'" >&2
    return 1
  fi

  last_gpu=$(( (card_num - 1) / node_num ))
  ids=$(seq -s "," 0 "${last_gpu}")
  echo "${ids}"
}
|
||
#######################################
# Print the last path component of the current working directory.
# Arguments: none
# Outputs:   directory name on stdout (empty line when the cwd is "/")
#######################################
function get_repo_name(){
  local cur_dir
  cur_dir=$(pwd)
  # Strip everything up to the last "/" with parameter expansion; unlike an
  # unquoted IFS="/" split this neither glob-expands the name nor changes
  # the caller's IFS.
  printf '%s\n' "${cur_dir##*/}"
}
|
||
FILENAME=$1
# MODE must be one of ['benchmark_train']
MODE=$2
PARAMS=$3
# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_null_DP_N1C1

# Fail early instead of continuing with a stale or missing working copy.
if [[ -z "${FILENAME}" || ! -f "${FILENAME}" ]]; then
  echo "Usage: bash test_tipc/benchmark_train.sh <config.txt> benchmark_train [params]" >&2
  exit 1
fi

# copy FILENAME as new: every later edit is applied to this working copy
new_filename="./test_tipc/benchmark_train.txt"
# -ef: skip the copy when the caller passed the working copy itself, since
# cp refuses to copy a file onto itself.
if [[ ! "${FILENAME}" -ef "${new_filename}" ]]; then
  cp -- "${FILENAME}" "${new_filename}" || exit 1
fi
FILENAME=$new_filename

# parser params from train_benchmark.txt
IFS=$'\n'
# mapfile keeps blank lines and does no globbing, so lines[i] is always file
# line i+1 and stays aligned with the line numbers reported by `grep -n`.
mapfile -t lines < "${FILENAME}"
# func_parser_value is not defined in this file (presumably provided by the
# sourced test_tipc/common_func.sh); lines[1] is the second line of the config.
model_name=$(func_parser_value "${lines[1]}")
|
||
# Get the (1-based) line number of the train_benchmark_params header
line_num=$(grep -n -m 1 "train_benchmark_params" "$FILENAME" | cut -d ":" -f 1)
if [[ ! "${line_num}" =~ ^[0-9]+$ ]]; then
  # Without the header every value below would silently be read from the
  # wrong line of the config.
  echo "benchmark_train.sh: no train_benchmark_params section in ${FILENAME}" >&2
  exit 1
fi
# for train log parser
# `lines` is 0-based while grep counts from 1, so lines[line_num] is the line
# right after the header (assuming `lines` holds one entry per file line).
batch_size=$(func_parser_value "${lines[line_num]}")
line_num=$((line_num + 1))
epoch=$(func_parser_value "${lines[line_num]}")
fp_items="null"
line_num=$((line_num + 1))
profile_option_key=$(func_parser_key "${lines[line_num]}")
profile_option_params=$(func_parser_value "${lines[line_num]}")
profile_option="${profile_option_key}:${profile_option_params}"

line_num=$((line_num + 1))
flags_value=$(func_parser_value "${lines[line_num]}")
# set flags: the value is a ";"-separated list, each entry is exported
IFS=";"
flags_list=(${flags_value})
for _flag in ${flags_list[*]}; do
  cmd="export ${_flag}"
  eval "${cmd}"
done
|
||
# 1-based line numbers of the config entries this script rewrites.
# They must match the layout of the config file passed as $1.
line_python=3
line_gpuid=4
line_precision=6
line_epoch=7
line_batchsize=9
line_profile=13
line_eval_py=24
line_export_py=30

# Log locations: everything lives under BENCHMARK_LOG_DIR, falling back to
# the current directory when that variable is unset or empty.
SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)}
repo_name=$(get_repo_name)
status_log="${SAVE_LOG}/benchmark_log/results.log"
mkdir -p "${SAVE_LOG}/benchmark_log/"

# Set the eval and export entries to "null", then point the config at the
# python interpreter chosen by this script.
for _line in "${line_eval_py}" "${line_export_py}"; do
  func_sed_params "$FILENAME" "${_line}" "null"
done
func_sed_params "$FILENAME" "${line_python}" "$python"
|
||
# Build the lists the main loop iterates over, either from the optional
# third argument or from the values read out of the config.
if [ -n "$PARAMS" ]; then
  # parser params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num}
  IFS="_"
  params_list=(${PARAMS})
  model_type=${params_list[0]}
  # keep only the numeric part of e.g. "bs8"
  batch_size=$(echo ${params_list[1]} | tr -cd "[0-9]")
  precision=${params_list[2]}
  # run_process_type=${params_list[3]}
  run_mode=${params_list[3]}
  device_num=${params_list[4]}

  IFS=";"
  fp_items_list=($precision)
  batch_size_list=($batch_size)
  device_num_list=($device_num)
else
  # No PARAMS given: use the "|"-separated values taken from the config and
  # the default device spec / run mode.
  IFS="|"
  batch_size_list=(${batch_size})
  fp_items_list=(${fp_items})
  device_num_list=(N1C4)
  run_mode="DP"
fi
|
||
# Main loop: one benchmark run per batch size / precision / device spec.
# A single-GPU spec first does an extra profiling run; every spec then does
# a timed training run whose log is passed to the analysis script.
IFS="|"
for batch_size in ${batch_size_list[*]}; do
  for precision in ${fp_items_list[*]}; do
    for device_num in ${device_num_list[*]}; do
      # sed batchsize and precision
      func_sed_params "$FILENAME" "${line_precision}" "$precision"
      func_sed_params "$FILENAME" "${line_batchsize}" "$MODE=$batch_size"
      func_sed_params "$FILENAME" "${line_epoch}" "$MODE=$epoch"
      gpu_id=$(set_gpu_id "$device_num")
      # common stem of every log file name written for this combination
      log_prefix="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}"

      # A one-character id list ("0") means a single GPU.
      if [ ${#gpu_id} -le 1 ]; then
        run_process_type="SingleP"
        log_path="$SAVE_LOG/profiling_log"
        mkdir -p "$log_path"
        log_name="${log_prefix}_profiling"
        func_sed_params "$FILENAME" "${line_gpuid}" "0"  # sed used gpu_id
        # set profile_option params: the whole line is replaced, so escape the
        # characters that are special in a sed replacement (\ / &) to keep
        # options such as profile_path=/some/dir intact.
        profile_repl=$(printf '%s' "${profile_option}" | sed -e 's|[\\/&]|\\&|g')
        sed -i "${line_profile}s/.*/${profile_repl}/" "${FILENAME}"

        # run test_train_inference_python.sh
        cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
        echo "$cmd"
        eval "$cmd"
        cat "${log_path}/${log_name}"
      else
        IFS=";"
        # Must run in the current shell: inside a command substitution the
        # unset would only affect a subshell and change nothing here.
        unset CUDA_VISIBLE_DEVICES
        run_process_type="MultiP"
        func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id"  # sed used gpu_id
      fi

      # without profile
      log_path="$SAVE_LOG/train_log"
      speed_log_path="$SAVE_LOG/index"
      mkdir -p "$log_path"
      mkdir -p "$speed_log_path"
      log_name="${log_prefix}_log"
      speed_log_name="${log_prefix}_speed"
      func_sed_params "$FILENAME" "${line_profile}" "null"  # sed profile_id as null
      cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
      echo "$cmd"
      # Epoch seconds: subtracting YYYYmmddHHMMSS stamps gives a wrong
      # duration whenever the run crosses a minute, hour or day boundary.
      job_bt=$(date '+%s')
      eval "$cmd"
      job_et=$(date '+%s')
      export model_run_time=$((job_et - job_bt))
      cat "${log_path}/${log_name}"

      # parser log
      _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
      cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
        --speed_log_file '${speed_log_path}/${speed_log_name}' \
        --model_name ${_model_name} \
        --base_batch_size ${batch_size} \
        --run_mode ${run_mode} \
        --fp_item ${precision} \
        --keyword ips: \
        --skip_steps 2 \
        --device_num ${device_num} \
        --speed_unit example/s \
        --convergence_key auc: "
      echo "$cmd"
      eval "$cmd"
      last_status=${PIPESTATUS[0]}
      status_check $last_status "${cmd}" "${status_log}"
    done
  done
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
===========================train_params=========================== | ||
model_name:dnn | ||
python:python3.7 | ||
gpu_list:0|-1 | ||
runner.use_gpu:True|False | ||
auto_cast:False | ||
runner.epochs:lite_train_lite_infer=4|whole_train_whole_infer=4|whole_infer=4|lite_train_whole_infer=4 | ||
runner.model_save_path | ||
runner.train_batch_size:lite_train_lite_infer=50|whole_train_whole_infer=512|whole_infer=50|lite_train_whole_infer=50 | ||
runner.infer_load_path:null | ||
train_model_name:lite_train_lite_infer=3|whole_train_whole_infer=3|whole_infer=3|lite_train_whole_infer=3 | ||
runner.test_data_dir:test_tipc/data/infer | ||
runner.train_data_dir:../../../test_tipc/data/train | ||
## | ||
trainer:norm_train | ||
norm_train:-u tools/trainer.py -m ./models/rank/dnn/config_bigdata.yaml -o runner.print_interval=2 | ||
pact_train:null | ||
fpgm_train:null | ||
distill_train:null | ||
null:null | ||
null:null | ||
## | ||
===========================eval_params=========================== | ||
eval:null | ||
null:null | ||
## | ||
===========================infer_params=========================== | ||
runner.model_save_path: | ||
runner.model_init_path: | ||
norm_export:-u tools/to_static.py -m ./models/rank/dnn/config_bigdata.yaml -o runner.CE=true | ||
quant_export:null | ||
fpgm_export:null | ||
distill_export:null | ||
null:null | ||
null:null | ||
## | ||
infer_model:test_tipc/save_dnn_model | ||
infer_export:null | ||
infer_quant:False | ||
inference:-u tools/paddle_infer.py --model_name=dnn --reader_file=models/rank/dnn/criteo_reader.py | ||
--use_gpu:True|False | ||
--enable_mkldnn:True|False | ||
--cpu_threads:1|6 | ||
--batchsize:10 | ||
--enable_tensorRT:True|False | ||
--precision:fp32 | ||
--model_dir: | ||
--data_dir:test_tipc/data/infer | ||
--save_log_path:./test_tipc/output/ | ||
--benchmark:True | ||
null:null | ||
===========================train_benchmark_params=========================== | ||
batch_size:2048 | ||
epoch:50 | ||
--profiler_options="batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile" | ||
run_mode:PSGPU | ||
fp_items:null |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# TIPC Linux端Benchmark测试文档 | ||
|
||
该文档为Benchmark测试说明,Benchmark训练性能测试的主程序为`benchmark_train.sh`,用于验证和监控模型训练的性能。
|
||
# 1. 测试流程 | ||
## 1.1 准备数据和环境安装 | ||
运行`test_tipc/prepare.sh`,完成训练数据准备和安装环境流程。 | ||
|
||
```shell | ||
# 运行格式:bash test_tipc/prepare.sh train_benchmark.txt mode | ||
bash test_tipc/prepare.sh test_tipc/configs/dnn/train_infer_python.txt benchmark_train | ||
``` | ||
|
||
## 1.2 功能测试 | ||
执行`test_tipc/benchmark_train.sh`,完成模型训练和日志解析 | ||
|
||
```shell | ||
# 运行格式:bash test_tipc/benchmark_train.sh train_benchmark.txt mode | ||
bash test_tipc/benchmark_train.sh test_tipc/configs/dnn/train_infer_python.txt benchmark_train | ||
``` | ||
|
||
`test_tipc/benchmark_train.sh`支持根据传入的第三个参数实现只运行某一个训练配置,如下: | ||
```shell | ||
# 运行格式:bash test_tipc/benchmark_train.sh train_benchmark.txt mode params
bash test_tipc/benchmark_train.sh test_tipc/configs/dnn/train_infer_python.txt benchmark_train null_bs8_null_null_N1C8 | ||
``` | ||
null_bs8_null_null_N1C8为test_tipc/benchmark_train.sh传入的参数,格式如下:
`${modeltype}_${batch_size}_${fp_item}_${run_mode}_${device_num}` | ||
包含的信息有:模型类型、batchsize大小、训练精度如fp32,fp16等、分布式运行模式以及分布式训练使用的机器信息如单机单卡(N1C1)。 | ||
|
||
|
||
## 2. 日志输出 | ||
benchmark训练得到训练日志后,会自动保存训练日志并解析得到ips等信息。在benchmark测试时,会自动调用`${BENCHMARK_ROOT}/scripts/analysis.py`。
|
||
BENCHMARK_ROOT 通过设置环境变量的方式来设置,比如: | ||
``` | ||
export BENCHMARK_ROOT=/paddle/PaddleRec/test_tipc | ||
benchmark_train.sh在运行时会自动调用/paddle/PaddleRec/test_tipc/scripts/analysis.py | ||
``` | ||
运行后将保存模型的训练日志和解析日志,使用 `test_tipc/configs/dnn/train_benchmark.txt` 参数文件的训练日志解析结果是: | ||
|
||
``` | ||
{"model_branch": "gpups", "model_commit": "2ccd243761b39dffe037cef5160dda722f121311", "model_name": "dnn_bs2048_3_MultiP_DP", "batch_size": 2048, "fp_item": "3", "run_mode": "DP", "convergence_value": 0, "convergence_key": "loss:", "ips": 0, "speed_unit": "", "device_num": "N1C4", "model_run_time": "0", "frame_commit": "360b8383250774108a6561e7071d60189b0d0964", "frame_version": "0.0.0"} | ||
``` | ||
|
||
训练日志和日志解析结果保存在环境变量`BENCHMARK_LOG_DIR`指定的目录下(未设置时为当前目录),文件组织格式如下:
```
${BENCHMARK_LOG_DIR}/
├── index | ||
│ ├── PaddleOCR_det_mv3_db_v2_0_bs8_fp32_SingleP_DP_N1C1_speed | ||
│ └── PaddleOCR_det_mv3_db_v2_0_bs8_fp32_SingleP_DP_N1C4_speed | ||
├── profiling_log | ||
│ └── PaddleOCR_det_mv3_db_v2_0_bs8_fp32_SingleP_DP_N1C1_profiling | ||
└── train_log | ||
├── PaddleOCR_det_mv3_db_v2_0_bs8_fp32_SingleP_DP_N1C1_log | ||
└── PaddleOCR_det_mv3_db_v2_0_bs8_fp32_SingleP_DP_N1C4_log | ||
``` |
Oops, something went wrong.