From d444a9a00beb44e7ebe63a943e7f376746731bcb Mon Sep 17 00:00:00 2001 From: wangzhen38 Date: Wed, 15 Jun 2022 12:44:57 +0000 Subject: [PATCH 01/10] add gpups_1n1c --- test_tipc/benchmark_train.sh | 256 ++++++++++++++++ test_tipc/configs/dnn/train_infer_python.txt | 55 ++++ test_tipc/doc/benchmark_train.md | 57 ++++ test_tipc/prepare.sh | 91 +++--- test_tipc/scripts/analysis.py | 300 +++++++++++++++++++ test_tipc/test_train_inference_python.sh | 78 ++++- tools/profiler.py | 110 +++++++ tools/static_gpubox_trainer.py | 13 + 8 files changed, 901 insertions(+), 59 deletions(-) create mode 100644 test_tipc/benchmark_train.sh create mode 100755 test_tipc/configs/dnn/train_infer_python.txt create mode 100644 test_tipc/doc/benchmark_train.md mode change 100644 => 100755 test_tipc/prepare.sh create mode 100644 test_tipc/scripts/analysis.py mode change 100644 => 100755 test_tipc/test_train_inference_python.sh create mode 100644 tools/profiler.py diff --git a/test_tipc/benchmark_train.sh b/test_tipc/benchmark_train.sh new file mode 100644 index 000000000..84935c272 --- /dev/null +++ b/test_tipc/benchmark_train.sh @@ -0,0 +1,256 @@ +#!/bin/bash +source test_tipc/common_func.sh + +# set env +python=python +export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3` +export model_commit=$(git log|head -n1|awk '{print $2}') +export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`) +export frame_version=${str_tmp%%.post*} +export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`) + +# run benchmark sh +# Usage: +# bash run_benchmark_train.sh config.txt params +# or +# bash run_benchmark_train.sh config.txt + +function func_parser_params(){ + strs=$1 + IFS="=" + array=(${strs}) + tmp=${array[1]} + echo ${tmp} +} + +function func_sed_params(){ + filename=$1 + line=$2 + param_value=$3 + params=`sed -n "${line}p" $filename` + IFS=":" + array=(${params}) + key=${array[0]} + value=${array[1]} + if [[ $value =~ 
'benchmark_train' ]];then + IFS='=' + _val=(${value}) + param_value="${_val[0]}=${param_value}" + fi + new_params="${key}:${param_value}" + IFS=";" + cmd="sed -i '${line}s/.*/${new_params}/' '${filename}'" + eval $cmd +} + +function set_gpu_id(){ + string=$1 + _str=${string:1:6} + IFS="C" + arr=(${_str}) + M=${arr[0]} + P=${arr[1]} + gn=`expr $P - 1` + gpu_num=`expr $gn / $M` + seq=`seq -s "," 0 $gpu_num` + echo $seq +} + +function get_repo_name(){ + IFS=";" + cur_dir=$(pwd) + IFS="/" + arr=(${cur_dir}) + echo ${arr[-1]} +} + +FILENAME=$1 +# copy FILENAME as new +new_filename="./test_tipc/benchmark_train.txt" +cmd=`yes|cp $FILENAME $new_filename` +FILENAME=$new_filename +# MODE must be one of ['benchmark_train'] +MODE=$2 +PARAMS=$3 +# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_null_DP_N1C1 +IFS=$'\n' +# parser params from train_benchmark.txt +dataline=`cat $FILENAME` +# parser params +IFS=$'\n' +lines=(${dataline}) +model_name=$(func_parser_value "${lines[1]}") + +# 获取benchmark_params所在的行数 +line_num=`grep -n "train_benchmark_params" $FILENAME | cut -d ":" -f 1` +# for train log parser +batch_size=$(func_parser_value "${lines[line_num]}") +line_num=`expr $line_num + 1` +fp_items=$(func_parser_value "${lines[line_num]}") +line_num=`expr $line_num + 1` +epoch=$(func_parser_value "${lines[line_num]}") + +line_num=`expr $line_num + 1` +profile_option_key=$(func_parser_key "${lines[line_num]}") +profile_option_params=$(func_parser_value "${lines[line_num]}") +profile_option="${profile_option_key}:${profile_option_params}" + +line_num=`expr $line_num + 1` +flags_value=$(func_parser_value "${lines[line_num]}") +# set flags +IFS=";" +flags_list=(${flags_value}) +for _flag in ${flags_list[*]}; do + cmd="export ${_flag}" + eval $cmd +done + +# set log_name +repo_name=$(get_repo_name ) +SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)} # */benchmark_log +mkdir -p "${SAVE_LOG}/benchmark_log/" 
+status_log="${SAVE_LOG}/benchmark_log/results.log" + +# The number of lines in which train params can be replaced. +line_python=3 +line_gpuid=4 +line_precision=6 +line_epoch=7 +line_batchsize=9 +line_profile=13 +line_eval_py=24 +line_export_py=30 + +func_sed_params "$FILENAME" "${line_eval_py}" "null" +func_sed_params "$FILENAME" "${line_export_py}" "null" +func_sed_params "$FILENAME" "${line_python}" "$python" + +# if params +if [ ! -n "$PARAMS" ] ;then + # PARAMS input is not a word. + IFS="|" + batch_size_list=(${batch_size}) + fp_items_list=(${fp_items}) + device_num_list=(N1C4) + run_mode="DP" +else + # parser params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num} + IFS="_" + params_list=(${PARAMS}) + model_type=${params_list[0]} + batch_size=${params_list[1]} + batch_size=`echo ${batch_size} | tr -cd "[0-9]" ` + precision=${params_list[2]} + # run_process_type=${params_list[3]} + run_mode=${params_list[3]} + device_num=${params_list[4]} + IFS=";" + + if [ ${precision} = "null" ];then + precision="fp32" + fi + + fp_items_list=($precision) + batch_size_list=($batch_size) + device_num_list=($device_num) +fi + +IFS="|" +for batch_size in ${batch_size_list[*]}; do + for precision in ${fp_items_list[*]}; do + for device_num in ${device_num_list[*]}; do + # sed batchsize and precision + func_sed_params "$FILENAME" "${line_precision}" "$precision" + func_sed_params "$FILENAME" "${line_batchsize}" "$MODE=$batch_size" + func_sed_params "$FILENAME" "${line_epoch}" "$MODE=$epoch" + gpu_id=$(set_gpu_id $device_num) + + if [ ${#gpu_id} -le 1 ];then + run_process_type="SingleP" + log_path="$SAVE_LOG/profiling_log" + mkdir -p $log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_profiling" + func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id + # set profile_option params + tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"` + + # run 
test_train_inference_python.sh + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + eval $cmd + eval "cat ${log_path}/${log_name}" + + # without profile + log_path="$SAVE_LOG/train_log" + speed_log_path="$SAVE_LOG/index" + mkdir -p $log_path + mkdir -p $speed_log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed" + func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + job_bt=`date '+%Y%m%d%H%M%S'` + eval $cmd + job_et=`date '+%Y%m%d%H%M%S'` + export model_run_time=$((${job_et}-${job_bt})) + eval "cat ${log_path}/${log_name}" + + # parser log + _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}" + cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ + --speed_log_file '${speed_log_path}/${speed_log_name}' \ + --model_name ${_model_name} \ + --base_batch_size ${batch_size} \ + --run_mode ${run_mode} \ + --fp_item ${precision} \ + --keyword ips: \ + --skip_steps 2 \ + --device_num ${device_num} \ + --speed_unit samples/s \ + --convergence_key loss: " + echo $cmd + eval $cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${cmd}" "${status_log}" + else + IFS=";" + unset_env=`unset CUDA_VISIBLE_DEVICES` + run_process_type="MultiP" + log_path="$SAVE_LOG/train_log" + speed_log_path="$SAVE_LOG/index" + mkdir -p $log_path + mkdir -p $speed_log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log" + 
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed" + func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id + func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + job_bt=`date '+%Y%m%d%H%M%S'` + eval $cmd + job_et=`date '+%Y%m%d%H%M%S'` + export model_run_time=$((${job_et}-${job_bt})) + eval "cat ${log_path}/${log_name}" + # parser log + _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}" + + cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ + --speed_log_file '${speed_log_path}/${speed_log_name}' \ + --model_name ${_model_name} \ + --base_batch_size ${batch_size} \ + --run_mode ${run_mode} \ + --fp_item ${precision} \ + --keyword ips: \ + --skip_steps 2 \ + --device_num ${device_num} \ + --speed_unit images/s \ + --convergence_key loss: " + echo $cmd + eval $cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${cmd}" "${status_log}" + fi + done + done +done diff --git a/test_tipc/configs/dnn/train_infer_python.txt b/test_tipc/configs/dnn/train_infer_python.txt new file mode 100755 index 000000000..b6b7dd591 --- /dev/null +++ b/test_tipc/configs/dnn/train_infer_python.txt @@ -0,0 +1,55 @@ +===========================train_params=========================== +model_name:dnn +python:python3.7 +gpu_list:0|-1 +runner.use_gpu:True|False +auto_cast:False +runner.epochs:lite_train_lite_infer=4|whole_train_whole_infer=4|whole_infer=4|lite_train_whole_infer=4 +runner.model_save_path +runner.train_batch_size:lite_train_lite_infer=50|whole_train_whole_infer=512|whole_infer=50|lite_train_whole_infer=50 +runner.infer_load_path:null +train_model_name:lite_train_lite_infer=3|whole_train_whole_infer=3|whole_infer=3|lite_train_whole_infer=3 
+runner.test_data_dir:test_tipc/data/infer +runner.train_data_dir:../../../test_tipc/data/train +## +trainer:norm_train +norm_train:-u tools/trainer.py -m ./models/rank/dnn/config_bigdata.yaml -o runner.print_interval=2 +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +runner.model_save_path: +runner.model_init_path: +norm_export:-u tools/to_static.py -m ./models/rank/dnn/config_bigdata.yaml -o runner.CE=true +quant_export:null +fpgm_export:null +distill_export:null +null:null +null:null +## +infer_model:test_tipc/save_dnn_model +infer_export:null +infer_quant:False +inference:-u tools/paddle_infer.py --model_name=dnn --reader_file=models/rank/dnn/criteo_reader.py +--use_gpu:True|False +--enable_mkldnn:True|False +--cpu_threads:1|6 +--batchsize:10 +--enable_tensorRT:True|False +--precision:fp32 +--model_dir: +--data_dir:test_tipc/data/infer +--save_log_path:./test_tipc/output/ +--benchmark:True +null:null +===========================train_benchmark_params=========================== +batchsize:2048 +epoch:3 +--profiler_options="batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile" diff --git a/test_tipc/doc/benchmark_train.md b/test_tipc/doc/benchmark_train.md new file mode 100644 index 000000000..cc3b17c7c --- /dev/null +++ b/test_tipc/doc/benchmark_train.md @@ -0,0 +1,57 @@ +# TIPC Linux端Benchmark测试文档 + +该文档为Benchmark测试说明,Benchmark预测功能测试的主程序为`benchmark_train.sh`,用于验证监控模型训练的性能。 + +# 1. 
测试流程 +## 1.1 准备数据和环境安装 +运行`test_tipc/prepare.sh`,完成训练数据准备和安装环境流程。 + +```shell +# 运行格式:bash test_tipc/prepare.sh train_benchmark.txt mode +bash test_tipc/prepare.sh test_tipc/configs/dnn/train_benchmark.txt benchmark_train +``` + +## 1.2 功能测试 +执行`test_tipc/benchmark_train.sh`,完成模型训练和日志解析 + +```shell +# 运行格式:bash test_tipc/benchmark_train.sh train_benchmark.txt mode +bash test_tipc/benchmark_train.sh test_tipc/configs/dnn/train_infer_python.txt benchmark_train +``` + +`test_tipc/benchmark_train.sh`支持根据传入的第三个参数实现只运行某一个训练配置,如下: +```shell +# 运行格式:bash test_tipc/benchmark_train.sh train_benchmark.txt mode params +bash test_tipc/benchmark_train.sh test_tipc/configs/dnn/train_infer_python.txt benchmark_train dynamic_bs8_fp32_DP_N1C1 +``` +dynamic_bs8_fp32_DP_N1C1为test_tipc/benchmark_train.sh传入的参数,格式如下: +`${modeltype}_${batch_size}_${fp_item}_${run_mode}_${device_num}` +包含的信息有:模型类型、batchsize大小、训练精度如fp32,fp16等、分布式运行模式以及分布式训练使用的机器信息如单机单卡(N1C1)。 + + +## 2. 日志输出 +benchmark训练得到训练日志后,会自动保存训练日志并解析得到ips等信息, 在benchmark测试时,会自动调用${BENCHMARK_ROOT}/scripts/analysis.py + +BENCHMARK_ROOT 通过设置环境变量的方式来设置,比如: +``` +export BENCHMARK_ROOT=/paddle/PaddleRec/test_tipc +benchmark_train.sh在运行时会自动调用/paddle/PaddleRec/test_tipc/scripts/analysis.py +``` +运行后将保存模型的训练日志和解析日志,使用 `test_tipc/configs/dnn/train_benchmark.txt` 参数文件的训练日志解析结果是: + +``` +{"model_branch": "gpups", "model_commit": "2ccd243761b39dffe037cef5160dda722f121311", "model_name": "dnn_bs2048_3_MultiP_DP", "batch_size": 2048, "fp_item": "3", "run_mode": "DP", "convergence_value": 0, "convergence_key": "loss:", "ips": 0, "speed_unit": "", "device_num": "N1C4", "model_run_time": "0", "frame_commit": "360b8383250774108a6561e7071d60189b0d0964", "frame_version": "0.0.0"} +``` + +训练日志和日志解析结果保存在benchmark_log目录下,文件组织格式如下: +``` +train_log/ +├── index +│ ├── PaddleOCR_det_mv3_db_v2_0_bs8_fp32_SingleP_DP_N1C1_speed +│ └── PaddleOCR_det_mv3_db_v2_0_bs8_fp32_SingleP_DP_N1C4_speed +├── profiling_log +│ └── PaddleOCR_det_mv3_db_v2_0_bs8_fp32_SingleP_DP_N1C1_profiling +└── train_log + ├── 
PaddleOCR_det_mv3_db_v2_0_bs8_fp32_SingleP_DP_N1C1_log + └── PaddleOCR_det_mv3_db_v2_0_bs8_fp32_SingleP_DP_N1C4_log +``` diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh old mode 100644 new mode 100755 index c4843e819..eeac2648d --- a/test_tipc/prepare.sh +++ b/test_tipc/prepare.sh @@ -18,7 +18,42 @@ model_name=$(func_parser_value "${lines[1]}") rm -rf ./test_tipc/data rm -rf ./test_tipc/output -if [ ${model_name} == "wide_deep" ]; then +if [ ${model_name} == "dnn" ]; then + # prepare pretrained weights and dataset + wget -nc -P ./test_tipc/save_dnn_model https://paddlerec.bj.bcebos.com/wide_deep/wide_deep.tar + cd test_tipc/save_dnn_model && tar -xvf wide_deep.tar && rm -rf wide_deep.tar && cd ../../ + + mkdir -p ./test_tipc/data/train + mkdir -p ./test_tipc/data/infer + if [ ${MODE} = "lite_train_lite_infer" ];then + cp -r ./models/rank/dnn/data/sample_data/train/* ./test_tipc/data/train + cp -r ./models/rank/dnn/data/sample_data/train/* ./test_tipc/data/infer + echo "demo data ready" + elif [ ${MODE} = "whole_train_whole_infer" ];then + cd ./datasets/criteo + bash run.sh + cd ../.. + cp -r ./datasets/criteo/slot_train_data_full/* ./test_tipc/data/train + cp -r ./datasets/criteo/slot_test_data_full/* ./test_tipc/data/infer + echo "whole data ready" + elif [ ${MODE} = "whole_infer" ];then + cd ./datasets/criteo + bash run.sh + cd ../.. + cp -r ./models/rank/dnn/data/sample_data/train/* ./test_tipc/data/train + cp -r ./datasets/criteo/slot_test_data_full/* ./test_tipc/data/infer + elif [ ${MODE} = "lite_train_whole_infer" ];then + cd ./datasets/criteo + bash run.sh + cd ../.. 
+ cp -r ./models/rank/dnn/data/sample_data/train/* ./test_tipc/data/train + cp -r ./datasets/criteo/slot_test_data_full/* ./test_tipc/data/infer + elif [ ${MODE} = "benchmark_train" ];then + cp -r ./models/rank/dnn/data/sample_data/train/* ./test_tipc/data/train + echo "demo data ready" + fi + +elif [ ${model_name} == "wide_deep" ]; then # prepare pretrained weights and dataset wget -nc -P ./test_tipc/save_wide_deep_model https://paddlerec.bj.bcebos.com/wide_deep/wide_deep.tar cd test_tipc/save_wide_deep_model && tar -xvf wide_deep.tar && rm -rf wide_deep.tar && cd ../../ @@ -366,35 +401,6 @@ elif [ ${model_name} == "sign" ]; then cp -r ./models/rank/sign/data/* ./test_tipc/data/train cp -r ./datasets/sign/test/* ./test_tipc/data/infer fi -elif [ ${model_name} == "fgcnn" ]; then - rm -rf ./test_tipc/data/* - mkdir -p ./test_tipc/data/train - mkdir -p ./test_tipc/data/infer - if [ ${MODE} = "lite_train_lite_infer" ];then - cp -r ./models/rank/fgcnn/data/trainlite/* ./test_tipc/data/train - cp -r ./models/rank/fgcnn/data/testlite/* ./test_tipc/data/infer - echo "demo data ready" - elif [ ${MODE} = "whole_train_whole_infer" ];then - cd ./datasets/criteo_fgcnn - bash run.sh - cd ../.. - cp -r ./datasets/criteo_fgcnn/train/train.h5 ./test_tipc/data/train - cp -r ./datasets/criteo_fgcnn/test/valid.h5 ./test_tipc/data/infer - echo "whole data ready" - elif [ ${MODE} = "whole_infer" ];then - cd ./datasets/criteo_fgcnn - bash run.sh - cd ../.. - cp -r ./datasets/criteo_fgcnn/train/train.h5 ./test_tipc/data/train - cp -r ./datasets/criteo_fgcnn/test/valid.h5 ./test_tipc/data/infer - echo "whole data ready" - elif [ ${MODE} = "lite_train_whole_infer" ];then - cd ./datasets/criteo_fgcnn - bash run.sh - cd ../.. 
- cp -r ./models/rank/fgcnn/data/trainlite/* ./test_tipc/data/train - cp -r ./datasets/criteo_fgcnn/test/valid.h5 ./test_tipc/data/infer - fi elif [ ${model_name} == "iprec" ]; then mkdir -p ./test_tipc/data/train mkdir -p ./test_tipc/data/infer @@ -422,29 +428,4 @@ elif [ ${model_name} == "iprec" ]; then cp -r ./datasets/iprec/whole_data/train/* ./test_tipc/data/train cp -r ./datasets/iprec/whole_data/test/* ./test_tipc/data/infer fi -elif [ ${model_name} == "kim" ]; then - rm -rf ./test_tipc/data/* - mkdir -p ./test_tipc/data/train - if [ ${MODE} = "lite_train_lite_infer" ];then - cp -r ./models/match/kim/data/sample_data/* ./test_tipc/data/train - echo "demo data ready" - elif [ ${MODE} = "whole_train_whole_infer" ];then - cd ./datasets/kim - bash run.sh - cd ../.. - cp -r ./datasets/kim/data/whole_data/* ./test_tipc/data/train - echo "whole data ready" - elif [ ${MODE} = "whole_infer" ];then - cd ./datasets/kim - bash run.sh - cd ../.. - cp -r ./datasets/kim/data/whole_data/* ./test_tipc/data/train - echo "whole data ready" - elif [ ${MODE} = "lite_train_whole_infer" ];then - cd ./datasets/kim - bash run.sh - cd ../.. - cp -r ./datasets/kim/data/whole_data/* ./test_tipc/data/train - echo "whole data ready" - fi fi diff --git a/test_tipc/scripts/analysis.py b/test_tipc/scripts/analysis.py new file mode 100644 index 000000000..9a3aae1fe --- /dev/null +++ b/test_tipc/scripts/analysis.py @@ -0,0 +1,300 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import argparse +import json +import os +import re +import traceback + + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--filename", type=str, help="The name of log which need to analysis.") + parser.add_argument( + "--speed_log_file", type=str, help="json file") + parser.add_argument( + "--log_with_profiler", type=str, help="The path of train log with profiler") + parser.add_argument( + "--profiler_path", type=str, help="The path of profiler timeline log.") + parser.add_argument( + "--keyword", type=str, help="Keyword to specify analysis data") + parser.add_argument( + "--separator", type=str, default=None, help="Separator of different field in log") + parser.add_argument( + '--position', type=int, default=None, help='The position of data field') + parser.add_argument( + '--range', type=str, default="", help='The range of data field to intercept') + parser.add_argument( + '--skip_steps', type=int, default=0, help='The number of steps to be skipped') + parser.add_argument( + '--model_mode', type=int, default=-1, help='Analysis mode, default value is -1') + + parser.add_argument( + '--model_name', type=str, default="model_name", help='training model_name, transformer_base') + parser.add_argument( + '--base_batch_size', type=int, help='base_batch size on gpu') + parser.add_argument( + '--fp_item', type=str, help='fp_item:fp16|fp32') + parser.add_argument( + '--run_mode', type=str, default="DP", help='DP|MP|PP') + parser.add_argument( + '--convergence_key', type=str, default="", help="Keyword to specify loss data") + parser.add_argument( + '--speed_unit', type=str, default="images/s", help='IPS unit') + parser.add_argument( + '--device_num', type=str, default='N1C1', help='device_num:N1C1|N1C8|N4C32') + args = parser.parse_args() + args.separator = None if 
args.separator == "None" else args.separator + return args + + +def _is_number(num): + pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$') + result = pattern.match(num) + if result: + return True + else: + return False + + +class TimeAnalyzer(object): + def __init__(self, filename, keyword=None, separator=None, position=None, range="-1"): + if filename is None: + raise Exception("Please specify the filename!") + + if keyword is None: + raise Exception("Please specify the keyword!") + + self.filename = filename + self.keyword = keyword + self.separator = separator + self.position = position + self.range = range + self.records = None + self._distil() + + def _distil(self): + self.records = [] + with open(self.filename, "r") as f_object: + lines = f_object.readlines() + for line in lines: + if self.keyword not in line: + continue + try: + result = None + + # Distil the string from a line. + line = line.strip() + line_words = line.split(self.separator) if self.separator else line.split() + if args.position: + result = line_words[self.position] + else: + # Distil the string following the keyword. + for i in range(len(line_words) - 1): + if line_words[i] == self.keyword: + result = line_words[i + 1] + break + + # Distil the result from the picked string. + if not self.range: + result = result[0:] + elif _is_number(self.range): + result = result[0: int(self.range)] + else: + result = result[int(self.range.split(":")[0]): int(self.range.split(":")[1])] + self.records.append(float(result)) + except Exception as exc: + print("line is: {}; separator={}; position={}".format(line, self.separator, self.position)) + + print("Extract {} records: separator={}; position={}".format(len(self.records), self.separator, self.position)) + + def _get_fps(self, mode, base_batch_size, gpu_num, avg_of_records, unit=None): + if mode == -1 : + assert unit, "Please set the unit when mode is -1." 
+ fps = gpu_num * avg_of_records + elif mode == 0: + # s/step -> samples/s + fps = (base_batch_size * gpu_num) / avg_of_records + unit = "samples/s" + elif mode == 1: + # steps/s -> steps/s + fps = avg_of_records + unit = "steps/s" + elif mode == 2: + # s/step -> steps/s + fps = 1 / avg_of_records + unit = "steps/s" + elif mode == 3: + # steps/s -> samples/s + fps = base_batch_size * gpu_num * avg_of_records + unit = "samples/s" + elif mode == 4: + # s/epoch -> s/epoch + fps = avg_of_records + unit = "s/epoch" + else: + ValueError("Unsupported analysis mode.") + + return fps, unit + + def analysis(self, base_batch_size, gpu_num=1, skip_steps=0, mode=-1, unit=None): + if base_batch_size <= 0: + print("base_batch_size should larger than 0.") + return 0, '' + + if len(self.records) <= skip_steps: # to address the condition which item of log equals to skip_steps + print("no records") + return 0, '' + + sum_of_records = 0 + sum_of_records_skipped = 0 + skip_min = self.records[skip_steps] + skip_max = self.records[skip_steps] + + count = len(self.records) + for i in range(count): + sum_of_records += self.records[i] + if i >= skip_steps: + sum_of_records_skipped += self.records[i] + if self.records[i] < skip_min: + skip_min = self.records[i] + if self.records[i] > skip_max: + skip_max = self.records[i] + + avg_of_records = sum_of_records / float(count) + avg_of_records_skipped = sum_of_records_skipped / float(count - skip_steps) + + fps, fps_unit = self._get_fps(mode, base_batch_size, gpu_num, avg_of_records, unit) + fps_skipped, _ = self._get_fps(mode, base_batch_size, gpu_num, avg_of_records_skipped, unit) + if mode == -1: + print("average ips of %d steps, skip 0 step:" % count) + print("\tAvg: %.3f %s" % (avg_of_records, fps_unit)) + print("\tFPS: %.3f %s" % (fps, fps_unit)) + if skip_steps > 0: + print("average ips of %d steps, skip %d steps:" % (count, skip_steps)) + print("\tAvg: %.3f %s" % (avg_of_records_skipped, fps_unit)) + print("\tMin: %.3f %s" % (skip_min, 
fps_unit)) + print("\tMax: %.3f %s" % (skip_max, fps_unit)) + print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) + elif mode == 1 or mode == 3: + print("average latency of %d steps, skip 0 step:" % count) + print("\tAvg: %.3f steps/s" % avg_of_records) + print("\tFPS: %.3f %s" % (fps, fps_unit)) + if skip_steps > 0: + print("average latency of %d steps, skip %d steps:" % (count, skip_steps)) + print("\tAvg: %.3f steps/s" % avg_of_records_skipped) + print("\tMin: %.3f steps/s" % skip_min) + print("\tMax: %.3f steps/s" % skip_max) + print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) + elif mode == 0 or mode == 2: + print("average latency of %d steps, skip 0 step:" % count) + print("\tAvg: %.3f s/step" % avg_of_records) + print("\tFPS: %.3f %s" % (fps, fps_unit)) + if skip_steps > 0: + print("average latency of %d steps, skip %d steps:" % (count, skip_steps)) + print("\tAvg: %.3f s/step" % avg_of_records_skipped) + print("\tMin: %.3f s/step" % skip_min) + print("\tMax: %.3f s/step" % skip_max) + print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) + + return round(fps_skipped, 3), fps_unit + + +class ExceptionTest(Exception): + pass + + +class LossAnalyzer(object): + def __init__(self, filename, convergence_key=None, separator=None): + if filename is None: + raise Exception("Please specify the filename!") + if convergence_key is None: + raise Exception("Please specify the keyword of loss!") + self.filename = filename + self.convergence_key = convergence_key + self.separator = separator + + def get_loss(self): + with open(self.filename, "r") as f_object: + lines = f_object.readlines() + lines.reverse() + result_loss = 0 + for line in lines: + if self.convergence_key not in line: + continue + try: + result_loss = 0 + line = line.strip() + line_words = line.split(self.separator) if self.separator else line.split() + for i in range(len(line_words) - 1): + if line_words[i] == self.convergence_key: + result_loss = line_words[i + 1] + result_loss = result_loss.replace(',', '') + 
raise ExceptionTest() + except ExceptionTest: + break + print("\tLoss: {}".format(result_loss)) + return result_loss + + +if __name__ == "__main__": + args = parse_args() + run_info = dict() + run_info["model_branch"] = os.getenv("model_branch") + run_info["model_commit"] = os.getenv("model_commit") + run_info["model_name"] = args.model_name + run_info["batch_size"] = args.base_batch_size + run_info["fp_item"] = args.fp_item + if re.match(r'DP.-MP.-PP.', args.run_mode) or 'DP_MoE_C' in args.run_mode: + run_info["run_mode"] = 'Collective' + else: + run_info["run_mode"] = args.run_mode + run_info["convergence_value"] = 0 + run_info["convergence_key"] = args.convergence_key + run_info["ips"] = 0 + run_info["speed_unit"] = args.speed_unit + run_info["device_num"] = args.device_num + run_info["model_run_time"] = os.getenv('model_run_time') + run_info["frame_commit"] = os.getenv('frame_commit') + run_info["frame_version"] = os.getenv('frame_version') + device_num = args.device_num + print("---device_num:-", device_num) + index_c = device_num.index('C') + print("---index_c:-", index_c) + gpu_num = int(device_num[index_c + 1:len(device_num)]) + print("-----gpu_num:", gpu_num) + if "pwgan" in args.model_name: + print("------analysis ", args.model_name) + args.keyword="avg_ips:" + + try: + analyzer = TimeAnalyzer(args.filename, args.keyword, args.separator, args.position, args.range) + run_info["ips"], run_info["speed_unit"] = analyzer.analysis( + base_batch_size=args.base_batch_size, + gpu_num=gpu_num, + skip_steps=args.skip_steps, + mode=args.model_mode, + unit=args.speed_unit) + if args.convergence_key != "": + loss_analyzer = LossAnalyzer(args.filename, args.convergence_key) + run_info["convergence_value"] = loss_analyzer.get_loss() + except Exception: + traceback.print_exc() + print("{}".format(json.dumps(run_info))) # it's required, for the log file path insert to the database + with open(args.speed_log_file, "w") as f: + f.write(json.dumps(run_info)) diff --git 
a/test_tipc/test_train_inference_python.sh b/test_tipc/test_train_inference_python.sh old mode 100644 new mode 100755 index 56c6ff7e7..bc13ddf5d --- a/test_tipc/test_train_inference_python.sh +++ b/test_tipc/test_train_inference_python.sh @@ -205,6 +205,48 @@ function func_inference(){ done } +if [ ${MODE} = "benchmark_train" ]; then + if [ ! -d "./log" ]; then + mkdir ./log + echo "Create log floder for store running log" + fi + + export FLAGS_LAUNCH_BARRIER=0 + export PADDLE_TRAINER_ID=0 + export PADDLE_PSERVER_NUMS=1 + export PADDLE_TRAINERS=1 + export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS} + export POD_IP=127.0.0.1 + + # set free port if 29011 is occupied + export PADDLE_PSERVERS_IP_PORT_LIST="127.0.0.1:29011" + export PADDLE_PSERVER_PORT_ARRAY=(29011) + + # set gpu numbers according to your device + export FLAGS_selected_gpus="0,1,2,3,4,5,6,7" + + # set your model yaml + SC="tools/static_gpubox_trainer.py -m models/rank/dnn/config_gpubox.yaml" + + # run pserver + export TRAINING_ROLE=PSERVER + for((i=0;i<$PADDLE_PSERVER_NUMS;i++)) + do + cur_port=${PADDLE_PSERVER_PORT_ARRAY[$i]} + echo "PADDLE WILL START PSERVER "$cur_port + export PADDLE_PORT=${cur_port} + python3.7 -u $SC &> ./log/pserver.$i.log & + done + + # run trainer + export TRAINING_ROLE=TRAINER + for((i=0;i<$PADDLE_TRAINERS;i++)) + do + echo "PADDLE WILL START Trainer "$i + export PADDLE_TRAINER_ID=$i + python3.7 -u $SC &> ./log/worker.$i.log + done +fi if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then GPUID=$3 if [ ${#GPUID} -le 0 ];then @@ -324,15 +366,43 @@ else set_save_model=$(func_set_params "${save_model_key}" "${save_log}") if [ ${#gpu} -le 2 ];then # train with cpu or single gpu cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} " + eval "unset CUDA_VISIBLE_DEVICES" + eval $cmd + status_check $? 
"${cmd}" "${status_log}" + elif [ ${#ips} -le 26 ];then # train with multi-gpu - cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" + # run pserver + export TRAINING_ROLE=PSERVER + for((i=0;i<$PADDLE_PSERVER_NUMS;i++)) + do + cur_port=${PADDLE_PSERVER_PORT_ARRAY[$i]} + echo "PADDLE WILL START PSERVER "$cur_port + export PADDLE_PORT=${cur_port} + cmd="${python} ${SC} &> ./log/pserver.$i.log &" + eval "unset CUDA_VISIBLE_DEVICES" + eval $cmd + status_check $? "${cmd}" "${status_log}" + done + + # run trainer + export TRAINING_ROLE=TRAINER + for((i=0;i<$PADDLE_TRAINERS;i++)) + do + echo "PADDLE WILL START Trainer "$i + export PADDLE_TRAINER_ID=$i + cmd="${python} ${SC} &> ./log/worker.$i.log &" + eval "unset CUDA_VISIBLE_DEVICES" + eval $cmd + status_check $? "${cmd}" "${status_log}" + done else # train with multi-machine cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" + eval "unset CUDA_VISIBLE_DEVICES" + eval $cmd + status_check $? "${cmd}" "${status_log}" + fi # run train - eval "unset CUDA_VISIBLE_DEVICES" - eval $cmd - status_check $? "${cmd}" "${status_log}" set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${train_model_name}") # save norm trained models to set pretrain for pact training and fpgm training diff --git a/tools/profiler.py b/tools/profiler.py new file mode 100644 index 000000000..c4e28bc6b --- /dev/null +++ b/tools/profiler.py @@ -0,0 +1,110 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None + + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. 
+ ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. 
+ ''' + if options_str is None: + return + + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + + if _profiler_step_id == _profiler_options['batch_range'][0]: + paddle.utils.profiler.start_profiler( + _profiler_options['state'], _profiler_options['tracer_option']) + elif _profiler_step_id == _profiler_options['batch_range'][1]: + paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'], + _profiler_options['profile_path']) + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/tools/static_gpubox_trainer.py b/tools/static_gpubox_trainer.py index 7579195ad..fb0030d08 100755 --- a/tools/static_gpubox_trainer.py +++ b/tools/static_gpubox_trainer.py @@ -27,6 +27,7 @@ import warnings import logging from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil +import profiler fleet_util = FleetUtil() __dir__ = os.path.dirname(os.path.abspath(__file__)) @@ -45,12 +46,19 @@ def parse_args(): type=str, required=True, help='config file path') + parser.add_argument( + '--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".' 
+ ) args = parser.parse_args() args.abs_dir = os.path.dirname(os.path.abspath(args.config_yaml)) yaml_helper = YamlHelper() config = yaml_helper.load_yaml(args.config_yaml) config["yaml_path"] = args.config_yaml config["config_abs_dir"] = args.abs_dir + config["profiler_options"] = args.profiler_options yaml_helper.print_yaml(config) return config @@ -59,6 +67,7 @@ class Main(object): def __init__(self, config): self.metrics = {} self.config = config + self.profiler_options = config.get("profiler_options") self.input_data = None self.reader = None self.exe = None @@ -221,6 +230,7 @@ def dataset_train_loop(self, epoch): ] fetch_vars = [var for _, var in self.metrics.items()] print_step = int(config.get("runner.print_interval")) + profiler.add_profiler_step(self.profiler_options) self.exe.train_from_dataset( program=paddle.static.default_main_program(), dataset=self.reader, @@ -235,6 +245,7 @@ def dataloader_train_loop(self, epoch): while True: try: train_start = time.time() + profiler.add_profiler_step(self.profiler_options) # --------------------------------------------------- # fetch_var = self.exe.run( program=paddle.static.default_main_program(), @@ -280,6 +291,7 @@ def recdataset_train_loop(self, epoch): for batch_id, batch_data in enumerate(self.reader()): train_reader_cost += time.time() - reader_start train_start = time.time() + profiler.add_profiler_step(self.profiler_options) # --------------------------------------------------- # fetch_batch_var = self.exe.run( program=paddle.static.default_main_program(), @@ -325,6 +337,7 @@ def heter_train_loop(self, epoch): while True: try: train_start = time.time() + profiler.add_profiler_step(self.profiler_options) # --------------------------------------------------- # self.exe.run(program=paddle.static.default_main_program()) # --------------------------------------------------- # From 7885ab91c6eabee4ac2985161b7249aaf202c0dd Mon Sep 17 00:00:00 2001 From: wangzhen38 Date: Wed, 15 Jun 2022 12:50:08 +0000 Subject: 
[PATCH 02/10] add gpups_1n1c --- test_tipc/prepare.sh | 54 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh index eeac2648d..07adbaf86 100755 --- a/test_tipc/prepare.sh +++ b/test_tipc/prepare.sh @@ -428,4 +428,58 @@ elif [ ${model_name} == "iprec" ]; then cp -r ./datasets/iprec/whole_data/train/* ./test_tipc/data/train cp -r ./datasets/iprec/whole_data/test/* ./test_tipc/data/infer fi +elif [ ${model_name} == "kim" ]; then + rm -rf ./test_tipc/data/* + mkdir -p ./test_tipc/data/train + if [ ${MODE} = "lite_train_lite_infer" ];then + cp -r ./models/match/kim/data/sample_data/* ./test_tipc/data/train + echo "demo data ready" + elif [ ${MODE} = "whole_train_whole_infer" ];then + cd ./datasets/kim + bash run.sh + cd ../.. + cp -r ./datasets/kim/data/whole_data/* ./test_tipc/data/train + echo "whole data ready" + elif [ ${MODE} = "whole_infer" ];then + cd ./datasets/kim + bash run.sh + cd ../.. + cp -r ./datasets/kim/data/whole_data/* ./test_tipc/data/train + echo "whole data ready" + elif [ ${MODE} = "lite_train_whole_infer" ];then + cd ./datasets/kim + bash run.sh + cd ../.. + cp -r ./datasets/kim/data/whole_data/* ./test_tipc/data/train + echo "whole data ready" + fi +elif [ ${model_name} == "fgcnn" ]; then + rm -rf ./test_tipc/data/* + mkdir -p ./test_tipc/data/train + mkdir -p ./test_tipc/data/infer + if [ ${MODE} = "lite_train_lite_infer" ];then + cp -r ./models/rank/fgcnn/data/trainlite/* ./test_tipc/data/train + cp -r ./models/rank/fgcnn/data/testlite/* ./test_tipc/data/infer + echo "demo data ready" + elif [ ${MODE} = "whole_train_whole_infer" ];then + cd ./datasets/criteo_fgcnn + bash run.sh + cd ../.. + cp -r ./datasets/criteo_fgcnn/train/train.h5 ./test_tipc/data/train + cp -r ./datasets/criteo_fgcnn/test/valid.h5 ./test_tipc/data/infer + echo "whole data ready" + elif [ ${MODE} = "whole_infer" ];then + cd ./datasets/criteo_fgcnn + bash run.sh + cd ../.. 
+ cp -r ./datasets/criteo_fgcnn/train/train.h5 ./test_tipc/data/train + cp -r ./datasets/criteo_fgcnn/test/valid.h5 ./test_tipc/data/infer + echo "whole data ready" + elif [ ${MODE} = "lite_train_whole_infer" ];then + cd ./datasets/criteo_fgcnn + bash run.sh + cd ../.. + cp -r ./models/rank/fgcnn/data/trainlite/* ./test_tipc/data/train + cp -r ./datasets/criteo_fgcnn/test/valid.h5 ./test_tipc/data/infer + fi fi From 6308f2e987408425b5b27e6ab5f728654614e6b5 Mon Sep 17 00:00:00 2001 From: wangzhen38 Date: Wed, 22 Jun 2022 11:46:23 +0000 Subject: [PATCH 03/10] fix log sytle --- test_tipc/scripts/analysis.py | 127 ++++++++++++++++------- test_tipc/test_train_inference_python.sh | 8 +- tools/static_gpubox_trainer.py | 2 +- 3 files changed, 95 insertions(+), 42 deletions(-) diff --git a/test_tipc/scripts/analysis.py b/test_tipc/scripts/analysis.py index 9a3aae1fe..d17bdf8d7 100644 --- a/test_tipc/scripts/analysis.py +++ b/test_tipc/scripts/analysis.py @@ -25,39 +25,62 @@ def parse_args(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--filename", type=str, help="The name of log which need to analysis.") + parser.add_argument("--speed_log_file", type=str, help="json file") parser.add_argument( - "--speed_log_file", type=str, help="json file") - parser.add_argument( - "--log_with_profiler", type=str, help="The path of train log with profiler") + "--log_with_profiler", + type=str, + help="The path of train log with profiler") parser.add_argument( "--profiler_path", type=str, help="The path of profiler timeline log.") parser.add_argument( "--keyword", type=str, help="Keyword to specify analysis data") parser.add_argument( - "--separator", type=str, default=None, help="Separator of different field in log") + "--separator", + type=str, + default=None, + help="Separator of different field in log") parser.add_argument( - '--position', type=int, default=None, help='The position of data field') + '--position', + type=int, + default=None, + help='The 
position of data field') parser.add_argument( - '--range', type=str, default="", help='The range of data field to intercept') + '--range', + type=str, + default="", + help='The range of data field to intercept') parser.add_argument( - '--skip_steps', type=int, default=0, help='The number of steps to be skipped') + '--skip_steps', + type=int, + default=0, + help='The number of steps to be skipped') parser.add_argument( - '--model_mode', type=int, default=-1, help='Analysis mode, default value is -1') + '--model_mode', + type=int, + default=-1, + help='Analysis mode, default value is -1') parser.add_argument( - '--model_name', type=str, default="model_name", help='training model_name, transformer_base') + '--model_name', + type=str, + default="model_name", + help='training model_name, transformer_base') parser.add_argument( '--base_batch_size', type=int, help='base_batch size on gpu') + parser.add_argument('--fp_item', type=str, help='fp_item:fp16|fp32') + parser.add_argument('--run_mode', type=str, default="DP", help='DP|MP|PP') parser.add_argument( - '--fp_item', type=str, help='fp_item:fp16|fp32') - parser.add_argument( - '--run_mode', type=str, default="DP", help='DP|MP|PP') - parser.add_argument( - '--convergence_key', type=str, default="", help="Keyword to specify loss data") + '--convergence_key', + type=str, + default="", + help="Keyword to specify loss data") parser.add_argument( '--speed_unit', type=str, default="images/s", help='IPS unit') parser.add_argument( - '--device_num', type=str, default='N1C1', help='device_num:N1C1|N1C8|N4C32') + '--device_num', + type=str, + default='N1C1', + help='device_num:N1C1|N1C8|N4C32') args = parser.parse_args() args.separator = None if args.separator == "None" else args.separator return args @@ -73,7 +96,12 @@ def _is_number(num): class TimeAnalyzer(object): - def __init__(self, filename, keyword=None, separator=None, position=None, range="-1"): + def __init__(self, + filename, + keyword=None, + separator=None, + 
position=None, + range="-1"): if filename is None: raise Exception("Please specify the filename!") @@ -100,7 +128,8 @@ def _distil(self): # Distil the string from a line. line = line.strip() - line_words = line.split(self.separator) if self.separator else line.split() + line_words = line.split( + self.separator) if self.separator else line.split() if args.position: result = line_words[self.position] else: @@ -114,17 +143,25 @@ def _distil(self): if not self.range: result = result[0:] elif _is_number(self.range): - result = result[0: int(self.range)] + result = result[0:int(self.range)] else: - result = result[int(self.range.split(":")[0]): int(self.range.split(":")[1])] + result = result[int(self.range.split(":")[0]):int( + self.range.split(":")[1])] self.records.append(float(result)) except Exception as exc: - print("line is: {}; separator={}; position={}".format(line, self.separator, self.position)) - - print("Extract {} records: separator={}; position={}".format(len(self.records), self.separator, self.position)) - - def _get_fps(self, mode, base_batch_size, gpu_num, avg_of_records, unit=None): - if mode == -1 : + print("line is: {}; separator={}; position={}".format( + line, self.separator, self.position)) + + print("Extract {} records: separator={}; position={}".format( + len(self.records), self.separator, self.position)) + + def _get_fps(self, + mode, + base_batch_size, + gpu_num, + avg_of_records, + unit=None): + if mode == -1: assert unit, "Please set the unit when mode is -1." 
fps = gpu_num * avg_of_records elif mode == 0: @@ -152,12 +189,19 @@ def _get_fps(self, mode, base_batch_size, gpu_num, avg_of_records, unit=None): return fps, unit - def analysis(self, base_batch_size, gpu_num=1, skip_steps=0, mode=-1, unit=None): + def analysis(self, + base_batch_size, + gpu_num=1, + skip_steps=0, + mode=-1, + unit=None): if base_batch_size <= 0: print("base_batch_size should larger than 0.") return 0, '' - if len(self.records) <= skip_steps: # to address the condition which item of log equals to skip_steps + if len( + self.records + ) <= skip_steps: # to address the condition which item of log equals to skip_steps print("no records") return 0, '' @@ -177,16 +221,20 @@ def analysis(self, base_batch_size, gpu_num=1, skip_steps=0, mode=-1, unit=None) skip_max = self.records[i] avg_of_records = sum_of_records / float(count) - avg_of_records_skipped = sum_of_records_skipped / float(count - skip_steps) + avg_of_records_skipped = sum_of_records_skipped / float(count - + skip_steps) - fps, fps_unit = self._get_fps(mode, base_batch_size, gpu_num, avg_of_records, unit) - fps_skipped, _ = self._get_fps(mode, base_batch_size, gpu_num, avg_of_records_skipped, unit) + fps, fps_unit = self._get_fps(mode, base_batch_size, gpu_num, + avg_of_records, unit) + fps_skipped, _ = self._get_fps(mode, base_batch_size, gpu_num, + avg_of_records_skipped, unit) if mode == -1: print("average ips of %d steps, skip 0 step:" % count) print("\tAvg: %.3f %s" % (avg_of_records, fps_unit)) print("\tFPS: %.3f %s" % (fps, fps_unit)) if skip_steps > 0: - print("average ips of %d steps, skip %d steps:" % (count, skip_steps)) + print("average ips of %d steps, skip %d steps:" % + (count, skip_steps)) print("\tAvg: %.3f %s" % (avg_of_records_skipped, fps_unit)) print("\tMin: %.3f %s" % (skip_min, fps_unit)) print("\tMax: %.3f %s" % (skip_max, fps_unit)) @@ -196,7 +244,8 @@ def analysis(self, base_batch_size, gpu_num=1, skip_steps=0, mode=-1, unit=None) print("\tAvg: %.3f steps/s" % 
avg_of_records) print("\tFPS: %.3f %s" % (fps, fps_unit)) if skip_steps > 0: - print("average latency of %d steps, skip %d steps:" % (count, skip_steps)) + print("average latency of %d steps, skip %d steps:" % + (count, skip_steps)) print("\tAvg: %.3f steps/s" % avg_of_records_skipped) print("\tMin: %.3f steps/s" % skip_min) print("\tMax: %.3f steps/s" % skip_max) @@ -206,7 +255,8 @@ def analysis(self, base_batch_size, gpu_num=1, skip_steps=0, mode=-1, unit=None) print("\tAvg: %.3f s/step" % avg_of_records) print("\tFPS: %.3f %s" % (fps, fps_unit)) if skip_steps > 0: - print("average latency of %d steps, skip %d steps:" % (count, skip_steps)) + print("average latency of %d steps, skip %d steps:" % + (count, skip_steps)) print("\tAvg: %.3f s/step" % avg_of_records_skipped) print("\tMin: %.3f s/step" % skip_min) print("\tMax: %.3f s/step" % skip_max) @@ -240,7 +290,8 @@ def get_loss(self): try: result_loss = 0 line = line.strip() - line_words = line.split(self.separator) if self.separator else line.split() + line_words = line.split( + self.separator) if self.separator else line.split() for i in range(len(line_words) - 1): if line_words[i] == self.convergence_key: result_loss = line_words[i + 1] @@ -280,10 +331,11 @@ def get_loss(self): print("-----gpu_num:", gpu_num) if "pwgan" in args.model_name: print("------analysis ", args.model_name) - args.keyword="avg_ips:" + args.keyword = "avg_ips:" try: - analyzer = TimeAnalyzer(args.filename, args.keyword, args.separator, args.position, args.range) + analyzer = TimeAnalyzer(args.filename, args.keyword, args.separator, + args.position, args.range) run_info["ips"], run_info["speed_unit"] = analyzer.analysis( base_batch_size=args.base_batch_size, gpu_num=gpu_num, @@ -295,6 +347,7 @@ def get_loss(self): run_info["convergence_value"] = loss_analyzer.get_loss() except Exception: traceback.print_exc() - print("{}".format(json.dumps(run_info))) # it's required, for the log file path insert to the database + 
print("{}".format(json.dumps(run_info)) + ) # it's required, for the log file path insert to the database with open(args.speed_log_file, "w") as f: f.write(json.dumps(run_info)) diff --git a/test_tipc/test_train_inference_python.sh b/test_tipc/test_train_inference_python.sh index bc13ddf5d..8ca3abf40 100755 --- a/test_tipc/test_train_inference_python.sh +++ b/test_tipc/test_train_inference_python.sh @@ -235,7 +235,7 @@ if [ ${MODE} = "benchmark_train" ]; then cur_port=${PADDLE_PSERVER_PORT_ARRAY[$i]} echo "PADDLE WILL START PSERVER "$cur_port export PADDLE_PORT=${cur_port} - python3.7 -u $SC &> ./log/pserver.$i.log & + python3.7 -u $SC done # run trainer @@ -244,7 +244,7 @@ if [ ${MODE} = "benchmark_train" ]; then do echo "PADDLE WILL START Trainer "$i export PADDLE_TRAINER_ID=$i - python3.7 -u $SC &> ./log/worker.$i.log + python3.7 -u $SC done fi if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then @@ -378,7 +378,7 @@ else cur_port=${PADDLE_PSERVER_PORT_ARRAY[$i]} echo "PADDLE WILL START PSERVER "$cur_port export PADDLE_PORT=${cur_port} - cmd="${python} ${SC} &> ./log/pserver.$i.log &" + cmd="${python} ${SC}" eval "unset CUDA_VISIBLE_DEVICES" eval $cmd status_check $? "${cmd}" "${status_log}" @@ -390,7 +390,7 @@ else do echo "PADDLE WILL START Trainer "$i export PADDLE_TRAINER_ID=$i - cmd="${python} ${SC} &> ./log/worker.$i.log &" + cmd="${python} ${SC}" eval "unset CUDA_VISIBLE_DEVICES" eval $cmd status_check $? "${cmd}" "${status_log}" diff --git a/tools/static_gpubox_trainer.py b/tools/static_gpubox_trainer.py index fb0030d08..70ec79dfb 100755 --- a/tools/static_gpubox_trainer.py +++ b/tools/static_gpubox_trainer.py @@ -174,7 +174,7 @@ def run_worker(self): fleet_util.set_zero(self.model.batch_stat_neg.name, paddle.fluid.global_scope()) logger.info( - "Epoch: {}, using time {} second, ips {} {}/sec. auc: {}". + "Epoch: {}, using time: {} second, ips: {} {}/sec. auc: {}". 
format(epoch, epoch_time, epoch_speed, self.count_method, global_auc)) else: From bee9da66aee2d9461b953956c1407591af62b260 Mon Sep 17 00:00:00 2001 From: wangzhen38 Date: Wed, 22 Jun 2022 12:16:11 +0000 Subject: [PATCH 04/10] fix log sytle --- tools/static_gpubox_trainer.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/static_gpubox_trainer.py b/tools/static_gpubox_trainer.py index 70ec79dfb..e9278d748 100755 --- a/tools/static_gpubox_trainer.py +++ b/tools/static_gpubox_trainer.py @@ -85,7 +85,7 @@ def run(self): elif fleet.is_worker(): self.run_worker() fleet.stop_worker() - self.record_result() + #self.record_result() logger.info("Run Success, Exit.") logger.info("-" * 100) @@ -111,12 +111,12 @@ def run_worker(self): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() self.exe = paddle.static.Executor(place) - with open("./{}_worker_main_program.prototxt".format( - fleet.worker_index()), 'w+') as f: - f.write(str(paddle.static.default_main_program())) - with open("./{}_worker_startup_program.prototxt".format( - fleet.worker_index()), 'w+') as f: - f.write(str(paddle.static.default_startup_program())) + #with open("./{}_worker_main_program.prototxt".format( + # fleet.worker_index()), 'w+') as f: + # f.write(str(paddle.static.default_main_program())) + #with open("./{}_worker_startup_program.prototxt".format( + # fleet.worker_index()), 'w+') as f: + # f.write(str(paddle.static.default_startup_program())) self.exe.run(paddle.static.default_startup_program()) fleet.init_worker() From 2e63c3034574239bd99dca2fb38fba71d8051bc7 Mon Sep 17 00:00:00 2001 From: wangzhen38 Date: Thu, 23 Jun 2022 07:44:19 +0000 Subject: [PATCH 05/10] updata v2.3 from qa --- test_tipc/doc/benchmark_train.md | 4 ++-- test_tipc/test_train_inference_python.sh | 14 +++++++----- tools/static_gpubox_trainer.py | 28 ++++++++++++++++++------ 3 files changed, 31 insertions(+), 15 deletions(-) diff --git a/test_tipc/doc/benchmark_train.md 
b/test_tipc/doc/benchmark_train.md index cc3b17c7c..4dde315e6 100644 --- a/test_tipc/doc/benchmark_train.md +++ b/test_tipc/doc/benchmark_train.md @@ -8,7 +8,7 @@ ```shell # 运行格式:bash test_tipc/prepare.sh train_benchmark.txt mode -bash test_tipc/prepare.sh test_tipc/configs/dnn/train_benchmark.txt benchmark_train +bash test_tipc/prepare.sh test_tipc/configs/dnn/train_infer_python.txt benchmark_train ``` ## 1.2 功能测试 @@ -22,7 +22,7 @@ bash test_tipc/benchmark_train.sh test_tipc/configs/dnn/train_infer_python.txt b `test_tipc/benchmark_train.sh`支持根据传入的第三个参数实现只运行某一个训练配置,如下: ```shell # 运行格式:bash test_tipc/benchmark_train.sh train_benchmark.txt mode -bash test_tipc/benchmark_train.sh test_tipc/configs/dnn/train_infer_python.txt benchmark_train +bash test_tipc/benchmark_train.sh test_tipc/configs/dnn/train_infer_python.txt benchmark_train null_bs8_null_null_N1C8 ``` dynamic_bs8_fp32_DP_N1C1为test_tipc/benchmark_train.sh传入的参数,格式如下: `${modeltype}_${batch_size}_${fp_item}_${run_mode}_${device_num}` diff --git a/test_tipc/test_train_inference_python.sh b/test_tipc/test_train_inference_python.sh index 8ca3abf40..4dfbb27ee 100755 --- a/test_tipc/test_train_inference_python.sh +++ b/test_tipc/test_train_inference_python.sh @@ -223,11 +223,12 @@ if [ ${MODE} = "benchmark_train" ]; then export PADDLE_PSERVER_PORT_ARRAY=(29011) # set gpu numbers according to your device - export FLAGS_selected_gpus="0,1,2,3,4,5,6,7" + #export FLAGS_selected_gpus="0,1,2,3,4,5,6,7" + export FLAGS_selected_gpus=${gpu_list} # set your model yaml SC="tools/static_gpubox_trainer.py -m models/rank/dnn/config_gpubox.yaml" - + BATCH="-o runner.train_batch_size="$train_batch_value # run pserver export TRAINING_ROLE=PSERVER for((i=0;i<$PADDLE_PSERVER_NUMS;i++)) @@ -235,7 +236,8 @@ if [ ${MODE} = "benchmark_train" ]; then cur_port=${PADDLE_PSERVER_PORT_ARRAY[$i]} echo "PADDLE WILL START PSERVER "$cur_port export PADDLE_PORT=${cur_port} - python3.7 -u $SC + cmd="${python} ${SC} ${BATCH}" + eval $cmd done # run 
trainer @@ -244,10 +246,10 @@ if [ ${MODE} = "benchmark_train" ]; then do echo "PADDLE WILL START Trainer "$i export PADDLE_TRAINER_ID=$i - python3.7 -u $SC + cmd="${python} ${SC} ${BATCH}" + eval $cmd done -fi -if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then +elif [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then GPUID=$3 if [ ${#GPUID} -le 0 ];then env=" " diff --git a/tools/static_gpubox_trainer.py b/tools/static_gpubox_trainer.py index e9278d748..c7ba88bde 100755 --- a/tools/static_gpubox_trainer.py +++ b/tools/static_gpubox_trainer.py @@ -40,6 +40,7 @@ def parse_args(): parser = argparse.ArgumentParser("PaddleRec train script") + parser.add_argument("-o", "--opt", nargs='*', type=str) parser.add_argument( '-m', '--config_yaml', @@ -56,6 +57,19 @@ def parse_args(): args.abs_dir = os.path.dirname(os.path.abspath(args.config_yaml)) yaml_helper = YamlHelper() config = yaml_helper.load_yaml(args.config_yaml) + # modify config from command + if args.opt: + for parameter in args.opt: + parameter = parameter.strip() + key, value = parameter.split("=") + if type(config.get(key)) is int: + value = int(value) + if type(config.get(key)) is float: + value = float(value) + if type(config.get(key)) is bool: + value = (True if value.lower() == "true" else False) + config[key] = value + config["yaml_path"] = args.config_yaml config["config_abs_dir"] = args.abs_dir config["profiler_options"] = args.profiler_options @@ -85,7 +99,7 @@ def run(self): elif fleet.is_worker(): self.run_worker() fleet.stop_worker() - #self.record_result() + self.record_result() logger.info("Run Success, Exit.") logger.info("-" * 100) @@ -111,12 +125,12 @@ def run_worker(self): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() self.exe = paddle.static.Executor(place) - #with open("./{}_worker_main_program.prototxt".format( - # fleet.worker_index()), 'w+') as f: - # f.write(str(paddle.static.default_main_program())) - #with 
open("./{}_worker_startup_program.prototxt".format( - # fleet.worker_index()), 'w+') as f: - # f.write(str(paddle.static.default_startup_program())) + with open("./{}_worker_main_program.prototxt".format( + fleet.worker_index()), 'w+') as f: + f.write(str(paddle.static.default_main_program())) + with open("./{}_worker_startup_program.prototxt".format( + fleet.worker_index()), 'w+') as f: + f.write(str(paddle.static.default_startup_program())) self.exe.run(paddle.static.default_startup_program()) fleet.init_worker() From 0e2d7229983ced8a0833a531a62270285a3b49e3 Mon Sep 17 00:00:00 2001 From: wangzhen38 Date: Wed, 6 Jul 2022 02:28:44 +0000 Subject: [PATCH 06/10] update benchmark from qa --- test_tipc/benchmark_train.sh | 24 +- test_tipc/configs/dnn/train_infer_python.txt | 2 +- test_tipc/scripts/analysis.py | 353 ------------------- test_tipc/test_train_inference_python.sh | 5 +- 4 files changed, 13 insertions(+), 371 deletions(-) delete mode 100644 test_tipc/scripts/analysis.py diff --git a/test_tipc/benchmark_train.sh b/test_tipc/benchmark_train.sh index 84935c272..fa87f9f65 100644 --- a/test_tipc/benchmark_train.sh +++ b/test_tipc/benchmark_train.sh @@ -86,10 +86,8 @@ line_num=`grep -n "train_benchmark_params" $FILENAME | cut -d ":" -f 1` # for train log parser batch_size=$(func_parser_value "${lines[line_num]}") line_num=`expr $line_num + 1` -fp_items=$(func_parser_value "${lines[line_num]}") -line_num=`expr $line_num + 1` epoch=$(func_parser_value "${lines[line_num]}") - +fp_items=$(func_parser_value "${lines[line_num]}") line_num=`expr $line_num + 1` profile_option_key=$(func_parser_key "${lines[line_num]}") profile_option_params=$(func_parser_value "${lines[line_num]}") @@ -146,10 +144,6 @@ else device_num=${params_list[4]} IFS=";" - if [ ${precision} = "null" ];then - precision="fp32" - fi - fp_items_list=($precision) batch_size_list=($batch_size) device_num_list=($device_num) @@ -169,7 +163,7 @@ for batch_size in ${batch_size_list[*]}; do 
run_process_type="SingleP" log_path="$SAVE_LOG/profiling_log" mkdir -p $log_path - log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_profiling" + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_profiling" func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id # set profile_option params tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"` @@ -185,8 +179,8 @@ for batch_size in ${batch_size_list[*]}; do speed_log_path="$SAVE_LOG/index" mkdir -p $log_path mkdir -p $speed_log_path - log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log" - speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed" + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed" func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " echo $cmd @@ -197,7 +191,7 @@ for batch_size in ${batch_size_list[*]}; do eval "cat ${log_path}/${log_name}" # parser log - _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}" + _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}" cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ --speed_log_file '${speed_log_path}/${speed_log_name}' \ --model_name ${_model_name} \ @@ -221,8 +215,8 @@ for batch_size in ${batch_size_list[*]}; do speed_log_path="$SAVE_LOG/index" mkdir -p $log_path mkdir -p $speed_log_path - log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log" - 
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed" + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed" func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " @@ -233,7 +227,7 @@ for batch_size in ${batch_size_list[*]}; do export model_run_time=$((${job_et}-${job_bt})) eval "cat ${log_path}/${log_name}" # parser log - _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}" + _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}" cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ --speed_log_file '${speed_log_path}/${speed_log_name}' \ @@ -244,7 +238,7 @@ for batch_size in ${batch_size_list[*]}; do --keyword ips: \ --skip_steps 2 \ --device_num ${device_num} \ - --speed_unit images/s \ + --speed_unit samples/s \ --convergence_key loss: " echo $cmd eval $cmd diff --git a/test_tipc/configs/dnn/train_infer_python.txt b/test_tipc/configs/dnn/train_infer_python.txt index b6b7dd591..c914570b9 100755 --- a/test_tipc/configs/dnn/train_infer_python.txt +++ b/test_tipc/configs/dnn/train_infer_python.txt @@ -51,5 +51,5 @@ inference:-u tools/paddle_infer.py --model_name=dnn --reader_file=models/rank/dn null:null ===========================train_benchmark_params=========================== batchsize:2048 -epoch:3 +epochs:6 --profiler_options="batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile" diff --git a/test_tipc/scripts/analysis.py b/test_tipc/scripts/analysis.py deleted file mode 100644 index d17bdf8d7..000000000 --- 
a/test_tipc/scripts/analysis.py +++ /dev/null @@ -1,353 +0,0 @@ -# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import argparse -import json -import os -import re -import traceback - - -def parse_args(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--filename", type=str, help="The name of log which need to analysis.") - parser.add_argument("--speed_log_file", type=str, help="json file") - parser.add_argument( - "--log_with_profiler", - type=str, - help="The path of train log with profiler") - parser.add_argument( - "--profiler_path", type=str, help="The path of profiler timeline log.") - parser.add_argument( - "--keyword", type=str, help="Keyword to specify analysis data") - parser.add_argument( - "--separator", - type=str, - default=None, - help="Separator of different field in log") - parser.add_argument( - '--position', - type=int, - default=None, - help='The position of data field') - parser.add_argument( - '--range', - type=str, - default="", - help='The range of data field to intercept') - parser.add_argument( - '--skip_steps', - type=int, - default=0, - help='The number of steps to be skipped') - parser.add_argument( - '--model_mode', - type=int, - default=-1, - help='Analysis mode, default value is -1') - - parser.add_argument( - '--model_name', - type=str, - default="model_name", - help='training model_name, 
transformer_base') - parser.add_argument( - '--base_batch_size', type=int, help='base_batch size on gpu') - parser.add_argument('--fp_item', type=str, help='fp_item:fp16|fp32') - parser.add_argument('--run_mode', type=str, default="DP", help='DP|MP|PP') - parser.add_argument( - '--convergence_key', - type=str, - default="", - help="Keyword to specify loss data") - parser.add_argument( - '--speed_unit', type=str, default="images/s", help='IPS unit') - parser.add_argument( - '--device_num', - type=str, - default='N1C1', - help='device_num:N1C1|N1C8|N4C32') - args = parser.parse_args() - args.separator = None if args.separator == "None" else args.separator - return args - - -def _is_number(num): - pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$') - result = pattern.match(num) - if result: - return True - else: - return False - - -class TimeAnalyzer(object): - def __init__(self, - filename, - keyword=None, - separator=None, - position=None, - range="-1"): - if filename is None: - raise Exception("Please specify the filename!") - - if keyword is None: - raise Exception("Please specify the keyword!") - - self.filename = filename - self.keyword = keyword - self.separator = separator - self.position = position - self.range = range - self.records = None - self._distil() - - def _distil(self): - self.records = [] - with open(self.filename, "r") as f_object: - lines = f_object.readlines() - for line in lines: - if self.keyword not in line: - continue - try: - result = None - - # Distil the string from a line. - line = line.strip() - line_words = line.split( - self.separator) if self.separator else line.split() - if args.position: - result = line_words[self.position] - else: - # Distil the string following the keyword. - for i in range(len(line_words) - 1): - if line_words[i] == self.keyword: - result = line_words[i + 1] - break - - # Distil the result from the picked string. 
- if not self.range: - result = result[0:] - elif _is_number(self.range): - result = result[0:int(self.range)] - else: - result = result[int(self.range.split(":")[0]):int( - self.range.split(":")[1])] - self.records.append(float(result)) - except Exception as exc: - print("line is: {}; separator={}; position={}".format( - line, self.separator, self.position)) - - print("Extract {} records: separator={}; position={}".format( - len(self.records), self.separator, self.position)) - - def _get_fps(self, - mode, - base_batch_size, - gpu_num, - avg_of_records, - unit=None): - if mode == -1: - assert unit, "Please set the unit when mode is -1." - fps = gpu_num * avg_of_records - elif mode == 0: - # s/step -> samples/s - fps = (base_batch_size * gpu_num) / avg_of_records - unit = "samples/s" - elif mode == 1: - # steps/s -> steps/s - fps = avg_of_records - unit = "steps/s" - elif mode == 2: - # s/step -> steps/s - fps = 1 / avg_of_records - unit = "steps/s" - elif mode == 3: - # steps/s -> samples/s - fps = base_batch_size * gpu_num * avg_of_records - unit = "samples/s" - elif mode == 4: - # s/epoch -> s/epoch - fps = avg_of_records - unit = "s/epoch" - else: - ValueError("Unsupported analysis mode.") - - return fps, unit - - def analysis(self, - base_batch_size, - gpu_num=1, - skip_steps=0, - mode=-1, - unit=None): - if base_batch_size <= 0: - print("base_batch_size should larger than 0.") - return 0, '' - - if len( - self.records - ) <= skip_steps: # to address the condition which item of log equals to skip_steps - print("no records") - return 0, '' - - sum_of_records = 0 - sum_of_records_skipped = 0 - skip_min = self.records[skip_steps] - skip_max = self.records[skip_steps] - - count = len(self.records) - for i in range(count): - sum_of_records += self.records[i] - if i >= skip_steps: - sum_of_records_skipped += self.records[i] - if self.records[i] < skip_min: - skip_min = self.records[i] - if self.records[i] > skip_max: - skip_max = self.records[i] - - avg_of_records = 
sum_of_records / float(count) - avg_of_records_skipped = sum_of_records_skipped / float(count - - skip_steps) - - fps, fps_unit = self._get_fps(mode, base_batch_size, gpu_num, - avg_of_records, unit) - fps_skipped, _ = self._get_fps(mode, base_batch_size, gpu_num, - avg_of_records_skipped, unit) - if mode == -1: - print("average ips of %d steps, skip 0 step:" % count) - print("\tAvg: %.3f %s" % (avg_of_records, fps_unit)) - print("\tFPS: %.3f %s" % (fps, fps_unit)) - if skip_steps > 0: - print("average ips of %d steps, skip %d steps:" % - (count, skip_steps)) - print("\tAvg: %.3f %s" % (avg_of_records_skipped, fps_unit)) - print("\tMin: %.3f %s" % (skip_min, fps_unit)) - print("\tMax: %.3f %s" % (skip_max, fps_unit)) - print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) - elif mode == 1 or mode == 3: - print("average latency of %d steps, skip 0 step:" % count) - print("\tAvg: %.3f steps/s" % avg_of_records) - print("\tFPS: %.3f %s" % (fps, fps_unit)) - if skip_steps > 0: - print("average latency of %d steps, skip %d steps:" % - (count, skip_steps)) - print("\tAvg: %.3f steps/s" % avg_of_records_skipped) - print("\tMin: %.3f steps/s" % skip_min) - print("\tMax: %.3f steps/s" % skip_max) - print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) - elif mode == 0 or mode == 2: - print("average latency of %d steps, skip 0 step:" % count) - print("\tAvg: %.3f s/step" % avg_of_records) - print("\tFPS: %.3f %s" % (fps, fps_unit)) - if skip_steps > 0: - print("average latency of %d steps, skip %d steps:" % - (count, skip_steps)) - print("\tAvg: %.3f s/step" % avg_of_records_skipped) - print("\tMin: %.3f s/step" % skip_min) - print("\tMax: %.3f s/step" % skip_max) - print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) - - return round(fps_skipped, 3), fps_unit - - -class ExceptionTest(Exception): - pass - - -class LossAnalyzer(object): - def __init__(self, filename, convergence_key=None, separator=None): - if filename is None: - raise Exception("Please specify the filename!") - if 
convergence_key is None: - raise Exception("Please specify the keyword of loss!") - self.filename = filename - self.convergence_key = convergence_key - self.separator = separator - - def get_loss(self): - with open(self.filename, "r") as f_object: - lines = f_object.readlines() - lines.reverse() - result_loss = 0 - for line in lines: - if self.convergence_key not in line: - continue - try: - result_loss = 0 - line = line.strip() - line_words = line.split( - self.separator) if self.separator else line.split() - for i in range(len(line_words) - 1): - if line_words[i] == self.convergence_key: - result_loss = line_words[i + 1] - result_loss = result_loss.replace(',', '') - raise ExceptionTest() - except ExceptionTest: - break - print("\tLoss: {}".format(result_loss)) - return result_loss - - -if __name__ == "__main__": - args = parse_args() - run_info = dict() - run_info["model_branch"] = os.getenv("model_branch") - run_info["model_commit"] = os.getenv("model_commit") - run_info["model_name"] = args.model_name - run_info["batch_size"] = args.base_batch_size - run_info["fp_item"] = args.fp_item - if re.match(r'DP.-MP.-PP.', args.run_mode) or 'DP_MoE_C' in args.run_mode: - run_info["run_mode"] = 'Collective' - else: - run_info["run_mode"] = args.run_mode - run_info["convergence_value"] = 0 - run_info["convergence_key"] = args.convergence_key - run_info["ips"] = 0 - run_info["speed_unit"] = args.speed_unit - run_info["device_num"] = args.device_num - run_info["model_run_time"] = os.getenv('model_run_time') - run_info["frame_commit"] = os.getenv('frame_commit') - run_info["frame_version"] = os.getenv('frame_version') - device_num = args.device_num - print("---device_num:-", device_num) - index_c = device_num.index('C') - print("---index_c:-", index_c) - gpu_num = int(device_num[index_c + 1:len(device_num)]) - print("-----gpu_num:", gpu_num) - if "pwgan" in args.model_name: - print("------analysis ", args.model_name) - args.keyword = "avg_ips:" - - try: - analyzer = 
TimeAnalyzer(args.filename, args.keyword, args.separator, - args.position, args.range) - run_info["ips"], run_info["speed_unit"] = analyzer.analysis( - base_batch_size=args.base_batch_size, - gpu_num=gpu_num, - skip_steps=args.skip_steps, - mode=args.model_mode, - unit=args.speed_unit) - if args.convergence_key != "": - loss_analyzer = LossAnalyzer(args.filename, args.convergence_key) - run_info["convergence_value"] = loss_analyzer.get_loss() - except Exception: - traceback.print_exc() - print("{}".format(json.dumps(run_info)) - ) # it's required, for the log file path insert to the database - with open(args.speed_log_file, "w") as f: - f.write(json.dumps(run_info)) diff --git a/test_tipc/test_train_inference_python.sh b/test_tipc/test_train_inference_python.sh index 4dfbb27ee..726030aab 100755 --- a/test_tipc/test_train_inference_python.sh +++ b/test_tipc/test_train_inference_python.sh @@ -229,6 +229,7 @@ if [ ${MODE} = "benchmark_train" ]; then # set your model yaml SC="tools/static_gpubox_trainer.py -m models/rank/dnn/config_gpubox.yaml" BATCH="-o runner.train_batch_size="$train_batch_value + EPOCH="-o runner.epochs="$epoch_num # run pserver export TRAINING_ROLE=PSERVER for((i=0;i<$PADDLE_PSERVER_NUMS;i++)) @@ -236,7 +237,7 @@ if [ ${MODE} = "benchmark_train" ]; then cur_port=${PADDLE_PSERVER_PORT_ARRAY[$i]} echo "PADDLE WILL START PSERVER "$cur_port export PADDLE_PORT=${cur_port} - cmd="${python} ${SC} ${BATCH}" + cmd="${python} ${SC} ${BATCH} ${EPOCH}" eval $cmd done @@ -246,7 +247,7 @@ if [ ${MODE} = "benchmark_train" ]; then do echo "PADDLE WILL START Trainer "$i export PADDLE_TRAINER_ID=$i - cmd="${python} ${SC} ${BATCH}" + cmd="${python} ${SC} ${BATCH} ${EPOCH}" eval $cmd done elif [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then From c87a4e5c5e26f915714c0cdb96f39e360492b062 Mon Sep 17 00:00:00 2001 From: wangzhen38 Date: Wed, 6 Jul 2022 02:34:41 +0000 Subject: [PATCH 07/10] update benchmark from qa --- test_tipc/benchmark_train.sh 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_tipc/benchmark_train.sh b/test_tipc/benchmark_train.sh index fa87f9f65..71e3eadc4 100644 --- a/test_tipc/benchmark_train.sh +++ b/test_tipc/benchmark_train.sh @@ -87,7 +87,7 @@ line_num=`grep -n "train_benchmark_params" $FILENAME | cut -d ":" -f 1` batch_size=$(func_parser_value "${lines[line_num]}") line_num=`expr $line_num + 1` epoch=$(func_parser_value "${lines[line_num]}") -fp_items=$(func_parser_value "${lines[line_num]}") +fp_items="null" line_num=`expr $line_num + 1` profile_option_key=$(func_parser_key "${lines[line_num]}") profile_option_params=$(func_parser_value "${lines[line_num]}") From cd40dae4b3604c71ac9305e203eb52f90b1a5f65 Mon Sep 17 00:00:00 2001 From: wangzhen38 Date: Wed, 6 Jul 2022 03:12:49 +0000 Subject: [PATCH 08/10] update benchmark from qa --- test_tipc/benchmark_train.sh | 4 ++-- test_tipc/configs/dnn/train_infer_python.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test_tipc/benchmark_train.sh b/test_tipc/benchmark_train.sh index 71e3eadc4..5697eb913 100644 --- a/test_tipc/benchmark_train.sh +++ b/test_tipc/benchmark_train.sh @@ -202,7 +202,7 @@ for batch_size in ${batch_size_list[*]}; do --skip_steps 2 \ --device_num ${device_num} \ --speed_unit samples/s \ - --convergence_key loss: " + --convergence_key auc: " echo $cmd eval $cmd last_status=${PIPESTATUS[0]} @@ -239,7 +239,7 @@ for batch_size in ${batch_size_list[*]}; do --skip_steps 2 \ --device_num ${device_num} \ --speed_unit samples/s \ - --convergence_key loss: " + --convergence_key auc: " echo $cmd eval $cmd last_status=${PIPESTATUS[0]} diff --git a/test_tipc/configs/dnn/train_infer_python.txt b/test_tipc/configs/dnn/train_infer_python.txt index c914570b9..970ba7bd3 100755 --- a/test_tipc/configs/dnn/train_infer_python.txt +++ b/test_tipc/configs/dnn/train_infer_python.txt @@ -51,5 +51,5 @@ inference:-u tools/paddle_infer.py --model_name=dnn --reader_file=models/rank/dn null:null 
===========================train_benchmark_params=========================== batchsize:2048 -epochs:6 +epochs:12 --profiler_options="batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile" From df9c9f5fb560efdb4168d0d476474d8c30e9adbe Mon Sep 17 00:00:00 2001 From: wangzhen38 Date: Wed, 6 Jul 2022 08:00:18 +0000 Subject: [PATCH 09/10] back to fluid --- tools/static_gpubox_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/static_gpubox_trainer.py b/tools/static_gpubox_trainer.py index 2a6757707..1dd05495d 100755 --- a/tools/static_gpubox_trainer.py +++ b/tools/static_gpubox_trainer.py @@ -28,7 +28,7 @@ import logging import profiler -from paddle.incubate.fleet.utils.fleet_util import FleetUtil +from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil fleet_util = FleetUtil() __dir__ = os.path.dirname(os.path.abspath(__file__)) From 08be45fa38b41bbe4e7a236892f37dda2147c114 Mon Sep 17 00:00:00 2001 From: wangzhen38 Date: Thu, 7 Jul 2022 03:14:33 +0000 Subject: [PATCH 10/10] update by qa --- test_tipc/benchmark_train.sh | 4 ++-- test_tipc/configs/dnn/train_infer_python.txt | 6 ++++-- tools/static_gpubox_trainer.py | 2 ++ 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/test_tipc/benchmark_train.sh b/test_tipc/benchmark_train.sh index 5697eb913..214e3ebf3 100644 --- a/test_tipc/benchmark_train.sh +++ b/test_tipc/benchmark_train.sh @@ -201,7 +201,7 @@ for batch_size in ${batch_size_list[*]}; do --keyword ips: \ --skip_steps 2 \ --device_num ${device_num} \ - --speed_unit samples/s \ + --speed_unit example/s \ --convergence_key auc: " echo $cmd eval $cmd @@ -238,7 +238,7 @@ for batch_size in ${batch_size_list[*]}; do --keyword ips: \ --skip_steps 2 \ --device_num ${device_num} \ - --speed_unit samples/s \ + --speed_unit example/s \ --convergence_key auc: " echo $cmd eval $cmd diff --git a/test_tipc/configs/dnn/train_infer_python.txt b/test_tipc/configs/dnn/train_infer_python.txt index 
970ba7bd3..08fc02e68 100755 --- a/test_tipc/configs/dnn/train_infer_python.txt +++ b/test_tipc/configs/dnn/train_infer_python.txt @@ -50,6 +50,8 @@ inference:-u tools/paddle_infer.py --model_name=dnn --reader_file=models/rank/dn --benchmark:True null:null ===========================train_benchmark_params=========================== -batchsize:2048 -epochs:12 +batch_size:2048 +epoch:50 --profiler_options="batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile" +run_mode:PSGPU +fp_items:null diff --git a/tools/static_gpubox_trainer.py b/tools/static_gpubox_trainer.py index 1dd05495d..1ec0d6034 100755 --- a/tools/static_gpubox_trainer.py +++ b/tools/static_gpubox_trainer.py @@ -152,6 +152,7 @@ def run_worker(self): self.PSGPU.set_slot_vector(gpuslot) self.PSGPU.set_slot_dim_vector(gpu_mf_sizes) self.PSGPU.init_gpu_ps([int(s) for s in gpus_env.split(",")]) + gpu_num = len(gpus_env.split(",")) opt_info = paddle.static.default_main_program()._fleet_opt if use_auc is True: opt_info['stat_var_names'] = [ @@ -176,6 +177,7 @@ def run_worker(self): epoch_time = time.time() - epoch_start_time epoch_speed = self.example_nums / epoch_time + epoch_speed = epoch_speed / gpu_num if use_auc is True: global_auc = auc(self.model.stat_pos, self.model.stat_neg, paddle.static.global_scope(), fleet.util)