From b916b768e3cef927c35c09e800522e7a0db82785 Mon Sep 17 00:00:00 2001 From: Elias Bermudez <6505145+debermudez@users.noreply.github.com> Date: Wed, 8 Mar 2023 13:51:08 -0800 Subject: [PATCH] report overhead and request rate (#257) * Rename perf_status variables * More variable rename and print overhead percentage in verbose mode * Add per measurement client overhead output * Remove per pass overhead reporting * Add send request rate reporting * Check if overhead exceeds threshold * Revert formatting on cmake file * Add avg send rate to verbose output * Update variable names and adjust default overhead threshold default value --- src/c++/perf_analyzer/command_line_parser.h | 7 + src/c++/perf_analyzer/inference_profiler.cc | 255 ++++++++++++-------- src/c++/perf_analyzer/inference_profiler.h | 50 ++-- src/c++/perf_analyzer/perf_analyzer.cc | 12 +- src/c++/perf_analyzer/perf_analyzer.h | 2 +- 5 files changed, 195 insertions(+), 131 deletions(-) diff --git a/src/c++/perf_analyzer/command_line_parser.h b/src/c++/perf_analyzer/command_line_parser.h index 16d9fa02c..ae37dcde4 100644 --- a/src/c++/perf_analyzer/command_line_parser.h +++ b/src/c++/perf_analyzer/command_line_parser.h @@ -127,6 +127,13 @@ struct PerfAnalyzerParameters { using_concurrency_range || using_old_options || !(using_request_rate_range || using_custom_intervals)); } + + // Sets the threshold for PA client overhead. + // Overhead is defined as the percentage of time when PA is doing work and + // requests are not outstanding to the triton server. If the overhead + // percentage exceeds the threshold, a warning is displayed. + // + double overhead_pct_threshold{50.0}; }; using PAParamsPtr = std::shared_ptr; diff --git a/src/c++/perf_analyzer/inference_profiler.cc b/src/c++/perf_analyzer/inference_profiler.cc index b17a3052e..6ac548d0e 100644 --- a/src/c++/perf_analyzer/inference_profiler.cc +++ b/src/c++/perf_analyzer/inference_profiler.cc @@ -28,8 +28,11 @@ #include #include +#include +#include #include #include +#include #include #include "client_backend/client_backend.h" #include "doctest.h" @@ -323,7 +326,8 @@ cb::Error ReportClientSideStats( const ClientSideStats& stats, const int64_t percentile, const cb::ProtocolType protocol, const bool verbose, - const bool on_sequence_model, const bool include_lib_stats) + const bool on_sequence_model, const bool include_lib_stats, + const double overhead_pct, const double send_request_rate) { const uint64_t avg_latency_us = stats.avg_latency_ns / 1000; const uint64_t std_us = stats.std_us; @@ -382,6 +386,20 @@ ReportClientSideStats( } std::cout << " Throughput: " << stats.infer_per_sec << " infer/sec" << std::endl; + + if (verbose) { + std::stringstream client_overhead{""}; + std::stringstream send_rate{""}; + client_overhead << " " + << "Avg client overhead: " << std::fixed + << std::setprecision(2) << overhead_pct << "%"; + send_rate << " " + << "Avg send request rate: " << std::fixed << std::setprecision(2) + << send_request_rate << " infer/sec"; + std::cout << client_overhead.str() << std::endl; + std::cout << send_rate.str() << std::endl; + } + if (percentile == -1) { std::cout << " Avg latency: " << avg_latency_us << " usec" << " (standard deviation " << std_us << " usec)" << std::endl; @@ -403,12 +421,13 @@ Report( const cb::ProtocolType protocol, const bool verbose, const bool include_lib_stats, const bool include_server_stats, const std::shared_ptr& parser, - const bool should_collect_metrics) + const bool should_collect_metrics, const double overhead_pct_threshold) { 
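  // --- Illustrative sketch (not part of this patch) ------------------------
  // The two values reported in the verbose block above can be read as:
  //   overhead_pct      : share of the measurement window in which the client
  //                       was busy (preparing/recording requests) with no
  //                       request outstanding to the Triton server
  //   send_request_rate : requests handed off to the server per second of the
  //                       measurement window
  // The helper below sketches only that arithmetic; `window_duration_ns`,
  // `no_outstanding_request_ns` and `num_sent_requests` are hypothetical
  // inputs, not identifiers from this codebase.
  struct WindowClientMetrics {
    double overhead_pct{0.0};
    double send_request_rate{0.0};
  };

  WindowClientMetrics
  ComputeWindowClientMetrics(
      const uint64_t window_duration_ns,
      const uint64_t no_outstanding_request_ns, const size_t num_sent_requests)
  {
    WindowClientMetrics m;
    if (window_duration_ns == 0) {
      return m;
    }
    // Time with no outstanding request, as a percentage of the window.
    m.overhead_pct = std::min(
        100.0, 100.0 * static_cast<double>(no_outstanding_request_ns) /
                   static_cast<double>(window_duration_ns));
    // Requests sent per second over the window duration.
    m.send_request_rate =
        static_cast<double>(num_sent_requests) /
        (static_cast<double>(window_duration_ns) / NANOS_PER_SECOND);
    return m;
  }
  // --------------------------------------------------------------------------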
std::cout << " Client: " << std::endl; ReportClientSideStats( summary.client_stats, percentile, protocol, verbose, - summary.on_sequence_model, include_lib_stats); + summary.on_sequence_model, include_lib_stats, summary.overhead_pct, + summary.send_request_rate); if (include_server_stats) { std::cout << " Server: " << std::endl; @@ -420,6 +439,11 @@ Report( ReportPrometheusMetrics(summary.metrics.front()); } + if (summary.overhead_pct > overhead_pct_threshold) { + std::cout << "[WARNING] Perf Analyzer is not able to keep up with the " + "desired load. The results may not be accurate." + << std::endl; + } return cb::Error::Success; } @@ -436,14 +460,14 @@ InferenceProfiler::Create( std::unique_ptr* profiler, uint64_t measurement_request_count, MeasurementMode measurement_mode, std::shared_ptr mpi_driver, const uint64_t metrics_interval_ms, - const bool should_collect_metrics) + const bool should_collect_metrics, const double overhead_pct_threshold) { std::unique_ptr local_profiler(new InferenceProfiler( verbose, stability_threshold, measurement_window_ms, max_trials, (percentile != -1), percentile, latency_threshold_ms_, protocol, parser, profile_backend, std::move(manager), measurement_request_count, - measurement_mode, mpi_driver, metrics_interval_ms, - should_collect_metrics)); + measurement_mode, mpi_driver, metrics_interval_ms, should_collect_metrics, + overhead_pct_threshold)); *profiler = std::move(local_profiler); return cb::Error::Success; @@ -458,7 +482,8 @@ InferenceProfiler::InferenceProfiler( std::shared_ptr profile_backend, std::unique_ptr manager, uint64_t measurement_request_count, MeasurementMode measurement_mode, std::shared_ptr mpi_driver, - const uint64_t metrics_interval_ms, const bool should_collect_metrics) + const uint64_t metrics_interval_ms, const bool should_collect_metrics, + const double overhead_pct_threshold) : verbose_(verbose), measurement_window_ms_(measurement_window_ms), max_trials_(max_trials), extra_percentile_(extra_percentile), percentile_(percentile), latency_threshold_ms_(latency_threshold_ms_), @@ -466,7 +491,8 @@ InferenceProfiler::InferenceProfiler( manager_(std::move(manager)), measurement_request_count_(measurement_request_count), measurement_mode_(measurement_mode), mpi_driver_(mpi_driver), - should_collect_metrics_(should_collect_metrics) + should_collect_metrics_(should_collect_metrics), + overhead_pct_threshold_(overhead_pct_threshold) { load_parameters_.stability_threshold = stability_threshold; load_parameters_.stability_window = 3; @@ -492,13 +518,14 @@ InferenceProfiler::InferenceProfiler( cb::Error InferenceProfiler::Profile( - const size_t concurrent_request_count, std::vector& summary, - bool& meets_threshold, bool& is_stable) + const size_t concurrent_request_count, + std::vector& perf_statuses, bool& meets_threshold, + bool& is_stable) { cb::Error err; - PerfStatus status_summary; + PerfStatus perf_status; - status_summary.concurrency = concurrent_request_count; + perf_status.concurrency = concurrent_request_count; is_stable = false; meets_threshold = true; @@ -506,11 +533,11 @@ InferenceProfiler::Profile( RETURN_IF_ERROR(dynamic_cast(manager_.get()) ->ChangeConcurrencyLevel(concurrent_request_count)); - err = ProfileHelper(status_summary, &is_stable); + err = ProfileHelper(perf_status, &is_stable); if (err.IsOk()) { - summary.push_back(status_summary); + perf_statuses.push_back(perf_status); uint64_t stabilizing_latency_ms = - status_summary.stabilizing_latency_ns / NANOS_PER_MILLIS; + perf_status.stabilizing_latency_ns / 
NANOS_PER_MILLIS; if ((stabilizing_latency_ms >= latency_threshold_ms_) && (latency_threshold_ms_ != NO_LIMIT)) { std::cerr << "Measured latency went over the set limit of " @@ -531,8 +558,9 @@ InferenceProfiler::Profile( meets_threshold = false; } else { err = Report( - status_summary, percentile_, protocol_, verbose_, include_lib_stats_, - include_server_stats_, parser_, should_collect_metrics_); + perf_status, percentile_, protocol_, verbose_, include_lib_stats_, + include_server_stats_, parser_, should_collect_metrics_, + overhead_pct_threshold_); if (!err.IsOk()) { std::cerr << err; meets_threshold = false; @@ -547,13 +575,13 @@ InferenceProfiler::Profile( cb::Error InferenceProfiler::Profile( - const double request_rate, std::vector& summary, + const double request_rate, std::vector& perf_statuses, bool& meets_threshold, bool& is_stable) { cb::Error err; - PerfStatus status_summary; + PerfStatus perf_status; - status_summary.request_rate = request_rate; + perf_status.request_rate = request_rate; is_stable = false; meets_threshold = true; @@ -563,11 +591,11 @@ InferenceProfiler::Profile( std::cout << "Request Rate: " << request_rate << " inference requests per seconds" << std::endl; - err = ProfileHelper(status_summary, &is_stable); + err = ProfileHelper(perf_status, &is_stable); if (err.IsOk()) { - summary.push_back(status_summary); + perf_statuses.push_back(perf_status); uint64_t stabilizing_latency_ms = - status_summary.stabilizing_latency_ns / NANOS_PER_MILLIS; + perf_status.stabilizing_latency_ns / NANOS_PER_MILLIS; if ((stabilizing_latency_ms >= latency_threshold_ms_) && (latency_threshold_ms_ != NO_LIMIT)) { std::cerr << "Measured latency went over the set limit of " @@ -578,8 +606,9 @@ InferenceProfiler::Profile( meets_threshold = false; } else { err = Report( - status_summary, percentile_, protocol_, verbose_, include_lib_stats_, - include_server_stats_, parser_, should_collect_metrics_); + perf_status, percentile_, protocol_, verbose_, include_lib_stats_, + include_server_stats_, parser_, should_collect_metrics_, + overhead_pct_threshold_); if (!err.IsOk()) { std::cerr << err; meets_threshold = false; @@ -594,24 +623,25 @@ InferenceProfiler::Profile( cb::Error InferenceProfiler::Profile( - std::vector& summary, bool& meets_threshold, bool& is_stable) + std::vector& perf_statuses, bool& meets_threshold, + bool& is_stable) { cb::Error err; - PerfStatus status_summary; + PerfStatus perf_status; RETURN_IF_ERROR( dynamic_cast(manager_.get())->InitCustomIntervals()); RETURN_IF_ERROR(dynamic_cast(manager_.get()) - ->GetCustomRequestRate(&status_summary.request_rate)); + ->GetCustomRequestRate(&perf_status.request_rate)); is_stable = false; meets_threshold = true; - err = ProfileHelper(status_summary, &is_stable); + err = ProfileHelper(perf_status, &is_stable); if (err.IsOk()) { - summary.push_back(status_summary); + perf_statuses.push_back(perf_status); uint64_t stabilizing_latency_ms = - status_summary.stabilizing_latency_ns / NANOS_PER_MILLIS; + perf_status.stabilizing_latency_ns / NANOS_PER_MILLIS; if ((stabilizing_latency_ms >= latency_threshold_ms_) && (latency_threshold_ms_ != NO_LIMIT)) { std::cerr << "Measured latency went over the set limit of " @@ -622,8 +652,9 @@ InferenceProfiler::Profile( meets_threshold = false; } else { err = Report( - status_summary, percentile_, protocol_, verbose_, include_lib_stats_, - include_server_stats_, parser_, should_collect_metrics_); + perf_status, percentile_, protocol_, verbose_, include_lib_stats_, + include_server_stats_, parser_, 
should_collect_metrics_, + overhead_pct_threshold_); if (!err.IsOk()) { std::cerr << err; meets_threshold = false; @@ -637,13 +668,14 @@ InferenceProfiler::Profile( } cb::Error -InferenceProfiler::ProfileHelper(PerfStatus& status_summary, bool* is_stable) +InferenceProfiler::ProfileHelper( + PerfStatus& experiment_perf_status, bool* is_stable) { // Start measurement LoadStatus load_status; size_t completed_trials = 0; std::queue error; - std::deque perf_status; + std::deque measurement_perf_statuses; all_timestamps_.clear(); previous_window_end_ns_ = 0; @@ -653,25 +685,28 @@ InferenceProfiler::ProfileHelper(PerfStatus& status_summary, bool* is_stable) RETURN_IF_ERROR(manager_->SwapTimestamps(empty_timestamps)); do { - PerfStatus status_summary; + PerfStatus measurement_perf_status; RETURN_IF_ERROR(manager_->CheckHealth()); if (measurement_mode_ == MeasurementMode::TIME_WINDOWS) { - error.push(Measure(status_summary, measurement_window_ms_, false)); + error.push( + Measure(measurement_perf_status, measurement_window_ms_, false)); } else { - error.push(Measure(status_summary, measurement_request_count_, true)); + error.push( + Measure(measurement_perf_status, measurement_request_count_, true)); } - perf_status.push_back(status_summary); + measurement_perf_statuses.push_back(measurement_perf_status); if (error.size() > load_parameters_.stability_window) { error.pop(); - perf_status.pop_front(); + measurement_perf_statuses.pop_front(); } if (error.back().IsOk()) { load_status.infer_per_sec.push_back( - status_summary.client_stats.infer_per_sec); - load_status.latencies.push_back(status_summary.stabilizing_latency_ns); + measurement_perf_status.client_stats.infer_per_sec); + load_status.latencies.push_back( + measurement_perf_status.stabilizing_latency_ns); } else { load_status.infer_per_sec.push_back(0); load_status.latencies.push_back(std::numeric_limits::max()); @@ -688,16 +723,18 @@ InferenceProfiler::ProfileHelper(PerfStatus& status_summary, bool* is_stable) << " infer/sec. "; if (extra_percentile_) { std::cout << "p" << percentile_ << " latency: " - << (status_summary.client_stats.percentile_latency_ns - .find(percentile_) + << (measurement_perf_status.client_stats + .percentile_latency_ns.find(percentile_) ->second / 1000) << " usec" << std::endl; } else { std::cout << "Avg latency: " - << (status_summary.client_stats.avg_latency_ns / 1000) - << " usec (std " << status_summary.client_stats.std_us - << " usec)" << std::endl; + << (measurement_perf_status.client_stats.avg_latency_ns / + 1000) + << " usec (std " + << measurement_perf_status.client_stats.std_us << " usec). " + << std::endl; } } else { std::cout << " Pass [" << (completed_trials + 1) @@ -730,7 +767,8 @@ InferenceProfiler::ProfileHelper(PerfStatus& status_summary, bool* is_stable) // Only merge the results if the results have stabilized. if (*is_stable) { - RETURN_IF_ERROR(MergePerfStatusReports(perf_status, status_summary)); + RETURN_IF_ERROR(MergePerfStatusReports( + measurement_perf_statuses, experiment_perf_status)); } if (early_exit) { @@ -909,15 +947,16 @@ InferenceProfiler::MergeServerSideStats( cb::Error InferenceProfiler::MergePerfStatusReports( - std::deque& perf_status_reports, PerfStatus& summary_status) + std::deque& perf_status_reports, + PerfStatus& experiment_perf_status) { auto& perf_status = perf_status_reports[0]; // Make sure that the perf status reports profiling settings match with each // other. 
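  // --- Illustrative aside (not part of this patch) --------------------------
  // This merge only runs once ProfileHelper has judged the last
  // `stability_window` measurements stable ("Only merge the results if the
  // results have stabilized" above). The actual stability test is outside this
  // excerpt; a minimal sketch of one plausible criterion -- every throughput
  // and latency sample in the window staying within `stability_threshold`
  // (a fraction) of the window average -- is shown below. `values` and
  // `threshold` are hypothetical parameters; the sketch needs <numeric> and
  // <cmath>.
  auto is_window_stable = [](const std::vector<double>& values,
                             const double threshold) {
    if (values.empty()) {
      return false;
    }
    const double avg =
        std::accumulate(values.begin(), values.end(), 0.0) / values.size();
    for (const double v : values) {
      // Any sample deviating from the average by more than the threshold
      // fraction marks the window as unstable.
      if (avg == 0.0 || std::abs(v - avg) / avg > threshold) {
        return false;
      }
    }
    return true;
  };
  // --------------------------------------------------------------------------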
for (size_t i = 1; i < perf_status_reports.size(); i++) { - perf_status.concurrency = summary_status.concurrency; - perf_status.request_rate = summary_status.request_rate; + perf_status.concurrency = experiment_perf_status.concurrency; + perf_status.request_rate = experiment_perf_status.request_rate; if (perf_status_reports[i].on_sequence_model != perf_status.on_sequence_model) { @@ -937,95 +976,104 @@ InferenceProfiler::MergePerfStatusReports( } } - summary_status.batch_size = perf_status.batch_size; - summary_status.on_sequence_model = perf_status.on_sequence_model; + experiment_perf_status.batch_size = perf_status.batch_size; + experiment_perf_status.on_sequence_model = perf_status.on_sequence_model; // Initialize the client stats for the merged report. - summary_status.client_stats.request_count = 0; - summary_status.client_stats.sequence_count = 0; - summary_status.client_stats.delayed_request_count = 0; - summary_status.client_stats.duration_ns = 0; - summary_status.client_stats.avg_latency_ns = 0; - summary_status.client_stats.percentile_latency_ns.clear(); - summary_status.client_stats.latencies.clear(); - summary_status.client_stats.std_us = 0; - summary_status.client_stats.avg_request_time_ns = 0; - summary_status.client_stats.avg_send_time_ns = 0; - summary_status.client_stats.avg_receive_time_ns = 0; - summary_status.client_stats.infer_per_sec = 0; - summary_status.client_stats.sequence_per_sec = 0; - summary_status.client_stats.completed_count = 0; - summary_status.stabilizing_latency_ns = 0; + experiment_perf_status.client_stats.request_count = 0; + experiment_perf_status.client_stats.sequence_count = 0; + experiment_perf_status.client_stats.delayed_request_count = 0; + experiment_perf_status.client_stats.duration_ns = 0; + experiment_perf_status.client_stats.avg_latency_ns = 0; + experiment_perf_status.client_stats.percentile_latency_ns.clear(); + experiment_perf_status.client_stats.latencies.clear(); + experiment_perf_status.client_stats.std_us = 0; + experiment_perf_status.client_stats.avg_request_time_ns = 0; + experiment_perf_status.client_stats.avg_send_time_ns = 0; + experiment_perf_status.client_stats.avg_receive_time_ns = 0; + experiment_perf_status.client_stats.infer_per_sec = 0; + experiment_perf_status.client_stats.sequence_per_sec = 0; + experiment_perf_status.client_stats.completed_count = 0; + experiment_perf_status.stabilizing_latency_ns = 0; std::vector server_side_stats; for (auto& perf_status : perf_status_reports) { // Aggregated Client Stats - summary_status.client_stats.request_count += + experiment_perf_status.client_stats.request_count += perf_status.client_stats.request_count; - summary_status.client_stats.sequence_count += + experiment_perf_status.client_stats.sequence_count += perf_status.client_stats.sequence_count; - summary_status.client_stats.delayed_request_count += + experiment_perf_status.client_stats.delayed_request_count += perf_status.client_stats.delayed_request_count; - summary_status.client_stats.duration_ns += + experiment_perf_status.client_stats.duration_ns += perf_status.client_stats.duration_ns; server_side_stats.push_back(perf_status.server_stats); - summary_status.client_stats.latencies.insert( - summary_status.client_stats.latencies.end(), + experiment_perf_status.client_stats.latencies.insert( + experiment_perf_status.client_stats.latencies.end(), perf_status.client_stats.latencies.begin(), perf_status.client_stats.latencies.end()); + // Accumulate the overhead percentage and send rate here to remove extra + // traversals over 
the perf_status_reports + experiment_perf_status.overhead_pct += perf_status.overhead_pct; + experiment_perf_status.send_request_rate += perf_status.send_request_rate; } + // Calculate the average overhead_pct for the experiment. + experiment_perf_status.overhead_pct /= perf_status_reports.size(); + experiment_perf_status.send_request_rate /= perf_status_reports.size(); + if (include_lib_stats_) { for (auto& perf_status : perf_status_reports) { - summary_status.client_stats.completed_count += + experiment_perf_status.client_stats.completed_count += perf_status.client_stats.completed_count; - summary_status.client_stats.avg_request_time_ns += + experiment_perf_status.client_stats.avg_request_time_ns += perf_status.client_stats.avg_request_time_ns * perf_status.client_stats.completed_count; - summary_status.client_stats.avg_send_time_ns += + experiment_perf_status.client_stats.avg_send_time_ns += perf_status.client_stats.avg_send_time_ns * perf_status.client_stats.completed_count; - summary_status.client_stats.avg_receive_time_ns += + experiment_perf_status.client_stats.avg_receive_time_ns += perf_status.client_stats.avg_receive_time_ns * perf_status.client_stats.completed_count; } - if (summary_status.client_stats.completed_count != 0) { - summary_status.client_stats.avg_request_time_ns = - summary_status.client_stats.avg_request_time_ns / - summary_status.client_stats.completed_count; + if (experiment_perf_status.client_stats.completed_count != 0) { + experiment_perf_status.client_stats.avg_request_time_ns = + experiment_perf_status.client_stats.avg_request_time_ns / + experiment_perf_status.client_stats.completed_count; - summary_status.client_stats.avg_send_time_ns = - summary_status.client_stats.avg_send_time_ns / - summary_status.client_stats.completed_count; + experiment_perf_status.client_stats.avg_send_time_ns = + experiment_perf_status.client_stats.avg_send_time_ns / + experiment_perf_status.client_stats.completed_count; - summary_status.client_stats.avg_receive_time_ns = - summary_status.client_stats.avg_receive_time_ns / - summary_status.client_stats.completed_count; + experiment_perf_status.client_stats.avg_receive_time_ns = + experiment_perf_status.client_stats.avg_receive_time_ns / + experiment_perf_status.client_stats.completed_count; } } - RETURN_IF_ERROR( - MergeServerSideStats(server_side_stats, summary_status.server_stats)); + RETURN_IF_ERROR(MergeServerSideStats( + server_side_stats, experiment_perf_status.server_stats)); std::sort( - summary_status.client_stats.latencies.begin(), - summary_status.client_stats.latencies.end()); + experiment_perf_status.client_stats.latencies.begin(), + experiment_perf_status.client_stats.latencies.end()); float client_duration_sec = - (float)summary_status.client_stats.duration_ns / NANOS_PER_SECOND; - summary_status.client_stats.sequence_per_sec = - summary_status.client_stats.sequence_count / client_duration_sec; - summary_status.client_stats.infer_per_sec = - (summary_status.client_stats.request_count * summary_status.batch_size) / + (float)experiment_perf_status.client_stats.duration_ns / NANOS_PER_SECOND; + experiment_perf_status.client_stats.sequence_per_sec = + experiment_perf_status.client_stats.sequence_count / client_duration_sec; + experiment_perf_status.client_stats.infer_per_sec = + (experiment_perf_status.client_stats.request_count * + experiment_perf_status.batch_size) / client_duration_sec; - RETURN_IF_ERROR( - SummarizeLatency(summary_status.client_stats.latencies, summary_status)); + RETURN_IF_ERROR(SummarizeLatency( + 
experiment_perf_status.client_stats.latencies, experiment_perf_status)); if (should_collect_metrics_) { // Put all Metric objects in a flat vector so they're easier to merge @@ -1040,7 +1088,7 @@ InferenceProfiler::MergePerfStatusReports( Metrics merged_metrics{}; RETURN_IF_ERROR(MergeMetrics(all_metrics, merged_metrics)); - summary_status.metrics.push_back(std::move(merged_metrics)); + experiment_perf_status.metrics.push_back(std::move(merged_metrics)); } return cb::Error::Success; @@ -1063,8 +1111,7 @@ InferenceProfiler::GetServerSideStatus( // Used for measurement cb::Error InferenceProfiler::Measure( - PerfStatus& status_summary, uint64_t measurement_window, - bool is_count_based) + PerfStatus& perf_status, uint64_t measurement_window, bool is_count_based) { std::map start_status; std::map end_status; @@ -1122,7 +1169,7 @@ InferenceProfiler::Measure( previous_window_end_ns_ = window_end_ns; if (should_collect_metrics_) { - metrics_manager_->GetLatestMetrics(status_summary.metrics); + metrics_manager_->GetLatestMetrics(perf_status.metrics); } // Get server status and then print report on difference between @@ -1142,7 +1189,7 @@ InferenceProfiler::Measure( current_timestamps.end()); RETURN_IF_ERROR(Summarize( - start_status, end_status, start_stat, end_stat, status_summary, + start_status, end_status, start_stat, end_stat, perf_status, window_start_ns, window_end_ns)); return cb::Error::Success; diff --git a/src/c++/perf_analyzer/inference_profiler.h b/src/c++/perf_analyzer/inference_profiler.h index 21bb3b871..9867ee8cc 100644 --- a/src/c++/perf_analyzer/inference_profiler.h +++ b/src/c++/perf_analyzer/inference_profiler.h @@ -214,6 +214,8 @@ class InferenceProfiler { /// \param metrics_interval_ms The interval at which the server-side metrics /// \param should_collect_metrics Whether server-side inference server metrics /// should be collected. + /// \param overhead_pct_threshold User set threshold above which the PA + /// overhead is too significant to provide useable results. /// \return cb::Error object indicating success or failure. static cb::Error Create( const bool verbose, const double stability_threshold, @@ -225,7 +227,7 @@ class InferenceProfiler { std::unique_ptr* profiler, uint64_t measurement_request_count, MeasurementMode measurement_mode, std::shared_ptr mpi_driver, const uint64_t metrics_interval_ms, - const bool should_collect_metrics); + const bool should_collect_metrics, const double overhead_pct_threshold); /// Performs the profiling on the given range with the given search algorithm. 
/// For profiling using request rate invoke template with double, otherwise @@ -241,19 +243,19 @@ class InferenceProfiler { template cb::Error Profile( const T start, const T end, const T step, const SearchMode search_mode, - std::vector& summary) + std::vector& perf_statuses) { cb::Error err; bool meets_threshold, is_stable; if (search_mode == SearchMode::NONE) { - err = Profile(summary, meets_threshold, is_stable); + err = Profile(perf_statuses, meets_threshold, is_stable); if (!err.IsOk()) { return err; } } else if (search_mode == SearchMode::LINEAR) { T current_value = start; do { - err = Profile(current_value, summary, meets_threshold, is_stable); + err = Profile(current_value, perf_statuses, meets_threshold, is_stable); if (!err.IsOk()) { return err; } @@ -267,11 +269,11 @@ class InferenceProfiler { "Failed to obtain stable measurement.", pa::STABILITY_ERROR); } } else { - err = Profile(start, summary, meets_threshold, is_stable); + err = Profile(start, perf_statuses, meets_threshold, is_stable); if (!err.IsOk() || (!meets_threshold)) { return err; } - err = Profile(end, summary, meets_threshold, is_stable); + err = Profile(end, perf_statuses, meets_threshold, is_stable); if (!err.IsOk() || (meets_threshold)) { return err; } @@ -280,7 +282,7 @@ class InferenceProfiler { T this_end = end; while ((this_end - this_start) > step) { T current_value = (this_end + this_start) / 2; - err = Profile(current_value, summary, meets_threshold, is_stable); + err = Profile(current_value, perf_statuses, meets_threshold, is_stable); if (!err.IsOk()) { return err; } @@ -306,7 +308,8 @@ class InferenceProfiler { std::shared_ptr profile_backend, std::unique_ptr manager, uint64_t measurement_request_count, MeasurementMode measurement_mode, std::shared_ptr mpi_driver, - const uint64_t metrics_interval_ms, const bool should_collect_metrics); + const uint64_t metrics_interval_ms, const bool should_collect_metrics, + const double overhead_pct_threshold); /// Actively measure throughput in every 'measurement_window' msec until the /// throughput is stable. Once the throughput is stable, it adds the @@ -317,35 +320,38 @@ class InferenceProfiler { /// measures (we can't get the exact server status right before the first /// request and right after the last request in the measurement window). /// \param concurrent_request_count The concurrency level for the measurement. - /// \param summary Appends the measurements summary at the end of this list. - /// \param meets_threshold Returns whether the setting meets the threshold. + /// \param perf_statuses Appends the measurements summary at the end of this + /// list. \param meets_threshold Returns whether the setting meets the + /// threshold. /// \param is_stable Returns whether the measurement is stable. /// \return cb::Error object indicating success or failure. cb::Error Profile( - const size_t concurrent_request_count, std::vector& summary, - bool& meets_threshold, bool& is_stable); + const size_t concurrent_request_count, + std::vector& perf_statuses, bool& meets_threshold, + bool& is_stable); /// Similar to above function, but instead of setting the concurrency, it /// sets the specified request rate for measurements. /// \param request_rate The request rate for inferences. - /// \param summary Appends the measurements summary at the end of this list. - /// \param meets_threshold Returns whether the setting meets the threshold. - /// \param is_stable Returns whether the measurement is stable. 
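  // Note on the search template above (explanatory comment, not part of the
  // patch): SearchMode::NONE performs a single profile run via the
  // custom-interval overload declared further below; SearchMode::LINEAR walks
  // from `start` toward `end` in increments of `step` and returns
  // pa::STABILITY_ERROR when a stable measurement cannot be obtained; the
  // remaining (binary-search) branch first profiles `start`, which must meet
  // the latency threshold, then `end`, and continues only if `end` does not
  // meet it, bisecting [this_start, this_end] until the interval is no wider
  // than `step` -- presumably converging on the largest load that still
  // satisfies the threshold. For example (illustrative numbers), with
  // start=10, end=160, step=10 the bisection would measure 85, then 47 or 122
  // depending on whether 85 met the threshold, and so on.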
+ /// \param perf_statuses Appends the measurements summary at the end of this + /// list. \param meets_threshold Returns whether the setting meets the + /// threshold. \param is_stable Returns whether the measurement is stable. /// \return cb::Error object indicating success or failure. cb::Error Profile( - const double request_rate, std::vector& summary, + const double request_rate, std::vector& perf_statuses, bool& meets_threshold, bool& is_stable); /// Measures throughput and latencies for custom load without controling /// request rate nor concurrency. Requires load manager to be loaded with /// a file specifying the time intervals. - /// \param summary Appends the measurements summary at the end of this list. - /// \param meets_threshold Returns whether the measurement met the threshold. - /// \param is_stable Returns whether the measurement is stable. + /// \param perf_statuses Appends the measurements summary at the end of this + /// list. \param meets_threshold Returns whether the measurement met the + /// threshold. \param is_stable Returns whether the measurement is stable. /// \return cb::Error object indicating success /// or failure. cb::Error Profile( - std::vector& summary, bool& meets_threshold, bool& is_stable); + std::vector& perf_statuses, bool& meets_threshold, + bool& is_stable); /// A helper function for profiling functions. /// \param status_summary Returns the summary of the measurement. @@ -669,6 +675,10 @@ class InferenceProfiler { /// Whether server-side inference server metrics should be collected. bool should_collect_metrics_{false}; + /// User set threshold above which the PA overhead is too significant to + /// provide useable results. + const double overhead_pct_threshold_{0.0}; + #ifndef DOCTEST_CONFIG_DISABLE friend TestInferenceProfiler; diff --git a/src/c++/perf_analyzer/perf_analyzer.cc b/src/c++/perf_analyzer/perf_analyzer.cc index 1fe6dcf2b..3b203f63b 100644 --- a/src/c++/perf_analyzer/perf_analyzer.cc +++ b/src/c++/perf_analyzer/perf_analyzer.cc @@ -256,7 +256,7 @@ PerfAnalyzer::CreateAnalyzerObjects() parser_, std::move(backend_), std::move(manager), &profiler_, params_->measurement_request_count, params_->measurement_mode, params_->mpi_driver, params_->metrics_interval_ms, - params_->should_collect_metrics), + params_->should_collect_metrics, params_->overhead_pct_threshold), "failed to create profiler"); } @@ -354,13 +354,13 @@ PerfAnalyzer::Profile() if (params_->targeting_concurrency()) { err = profiler_->Profile( params_->concurrency_range.start, params_->concurrency_range.end, - params_->concurrency_range.step, params_->search_mode, summary_); + params_->concurrency_range.step, params_->search_mode, perf_statuses_); } else { err = profiler_->Profile( params_->request_rate_range[pa::SEARCH_RANGE::kSTART], params_->request_rate_range[pa::SEARCH_RANGE::kEND], params_->request_rate_range[pa::SEARCH_RANGE::kSTEP], - params_->search_mode, summary_); + params_->search_mode, perf_statuses_); } params_->mpi_driver->MPIBarrierWorld(); @@ -378,7 +378,7 @@ PerfAnalyzer::Profile() void PerfAnalyzer::WriteReport() { - if (!summary_.size()) { + if (!perf_statuses_.size()) { return; } @@ -390,7 +390,7 @@ PerfAnalyzer::WriteReport() std::cout << "p" << params_->percentile << " Batch Latency" << std::endl; } - for (pa::PerfStatus& status : summary_) { + for (pa::PerfStatus& status : perf_statuses_) { if (params_->targeting_concurrency()) { std::cout << "Concurrency: " << status.concurrency << ", "; } else { @@ -408,7 +408,7 @@ PerfAnalyzer::WriteReport() 
   FAIL_IF_ERR(
       pa::ReportWriter::Create(
-          params_->filename, params_->targeting_concurrency(), summary_,
+          params_->filename, params_->targeting_concurrency(), perf_statuses_,
           params_->verbose_csv, profiler_->IncludeServerStats(),
           params_->percentile, parser_, &writer, should_output_metrics),
       "failed to create report writer");
 
diff --git a/src/c++/perf_analyzer/perf_analyzer.h b/src/c++/perf_analyzer/perf_analyzer.h
index e51102acc..904aefe71 100644
--- a/src/c++/perf_analyzer/perf_analyzer.h
+++ b/src/c++/perf_analyzer/perf_analyzer.h
@@ -179,7 +179,7 @@ class PerfAnalyzer {
   std::unique_ptr<pa::InferenceProfiler> profiler_;
   std::unique_ptr<cb::ClientBackend> backend_;
   std::shared_ptr<pa::ModelParser> parser_;
-  std::vector<pa::PerfStatus> summary_;
+  std::vector<pa::PerfStatus> perf_statuses_;
 
   //
   // Helper methods
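For reference, the net effect on the client summary is that ReportClientSideStats() prints two extra lines when verbose output is enabled, and Report() appends a warning whenever the averaged overhead crosses overhead_pct_threshold. With illustrative numbers (not taken from a real run), the relevant portion of the output looks roughly like:

    Avg client overhead: 12.34%
    Avg send request rate: 512.00 infer/sec

[WARNING] Perf Analyzer is not able to keep up with the desired load. The results may not be accurate.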