diff --git a/.gitignore b/.gitignore index 73a5eca1b..eb7705cd5 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ /.vscode *.so node_modules +src/c++/perf_analyzer/builddir/ +src/c++/perf_analyzer/.vscode/ diff --git a/src/c++/perf_analyzer/inference_profiler.cc b/src/c++/perf_analyzer/inference_profiler.cc index dea52aca5..35f17f897 100644 --- a/src/c++/perf_analyzer/inference_profiler.cc +++ b/src/c++/perf_analyzer/inference_profiler.cc @@ -638,57 +638,12 @@ InferenceProfiler::ProfileHelper( } } - if (load_status.infer_per_sec.size() >= load_parameters_.stability_window) { - size_t idx = - load_status.infer_per_sec.size() - load_parameters_.stability_window; - if (load_status.infer_per_sec.size() > - load_parameters_.stability_window) { - load_status.avg_ips -= load_status.infer_per_sec[idx - 1] / - load_parameters_.stability_window; - load_status.avg_latency -= - load_status.latencies[idx - 1] / load_parameters_.stability_window; - } - *is_stable = true; - bool within_threshold = false; - for (; idx < load_status.infer_per_sec.size(); idx++) { - if (load_status.infer_per_sec[idx] == 0) { - *is_stable = false; - } - if ((load_status.latencies[idx] < - (latency_threshold_ms_ * 1000 * 1000))) { - within_threshold = true; - } + *is_stable = DetermineStability(load_status); - // We call it complete only if stability_window measurements are within - // +/-(stability_threshold)% of the average infer per second and latency - if ((load_status.infer_per_sec[idx] < - load_status.avg_ips * - (1 - load_parameters_.stability_threshold)) || - (load_status.infer_per_sec[idx] > - load_status.avg_ips * - (1 + load_parameters_.stability_threshold))) { - *is_stable = false; - } - if ((load_status.latencies[idx] < - load_status.avg_latency * - (1 - load_parameters_.stability_threshold)) || - (load_status.latencies[idx] > - load_status.avg_latency * - (1 + load_parameters_.stability_threshold))) { - *is_stable = false; - } - } - if (mpi_driver_->IsMPIRun()) { - if (AllMPIRanksAreStable(*is_stable)) { - break; - } - } else if (*is_stable) { - break; - } - if ((!within_threshold) && (latency_threshold_ms_ != NO_LIMIT)) { - break; - } + if (IsDoneProfiling(load_status, is_stable)) { + break; } + completed_trials++; } while ((!early_exit) && (completed_trials < max_trials_)); @@ -710,6 +665,93 @@ InferenceProfiler::ProfileHelper( return cb::Error::Success; } +bool +InferenceProfiler::DetermineStability(LoadStatus& load_status) +{ + bool stable = false; + if (load_status.infer_per_sec.size() >= load_parameters_.stability_window) { + stable = true; + size_t idx = + load_status.infer_per_sec.size() - load_parameters_.stability_window; + + for (size_t i = idx; i < load_status.infer_per_sec.size(); i++) { + if (load_status.infer_per_sec[i] == 0) { + stable = false; + } + } + + stable = stable && CheckWindowForStability(idx, load_status); + } + return stable; +} + +bool +InferenceProfiler::CheckWindowForStability(size_t idx, LoadStatus& load_status) +{ + return IsInferWindowStable(idx, load_status) && + IsLatencyWindowStable(idx, load_status); +} + +bool +InferenceProfiler::IsInferWindowStable(size_t idx, LoadStatus& load_status) +{ + auto infer_start = std::begin(load_status.infer_per_sec) + idx; + auto infer_per_sec_measurements = std::minmax_element( + infer_start, infer_start + load_parameters_.stability_window); + + auto max_infer_per_sec = *infer_per_sec_measurements.second; + auto min_infer_per_sec = *infer_per_sec_measurements.first; + + return max_infer_per_sec / min_infer_per_sec <= + 1 + load_parameters_.stability_threshold; +} + +bool +InferenceProfiler::IsLatencyWindowStable(size_t idx, LoadStatus& load_status) +{ + auto latency_start = std::begin(load_status.latencies) + idx; + auto latencies_per_sec_measurements = std::minmax_element( + latency_start, latency_start + load_parameters_.stability_window); + + auto max_latency = *latencies_per_sec_measurements.second; + auto min_latency = *latencies_per_sec_measurements.first; + + return max_latency / min_latency <= 1 + load_parameters_.stability_threshold; +} + +bool +InferenceProfiler::IsDoneProfiling(LoadStatus& load_status, bool* is_stable) +{ + bool done = false; + bool within_threshold = true; + if (load_status.infer_per_sec.size() >= load_parameters_.stability_window) { + size_t idx = + load_status.infer_per_sec.size() - load_parameters_.stability_window; + + for (; idx < load_status.infer_per_sec.size(); idx++) { + within_threshold &= CheckWithinThreshold(idx, load_status); + } + } + + if (mpi_driver_->IsMPIRun()) { + if (AllMPIRanksAreStable(*is_stable)) { + done = true; + } + } else if (*is_stable) { + done = true; + } + if ((!within_threshold) && (latency_threshold_ms_ != NO_LIMIT)) { + done = true; + } + return done; +} + +bool +InferenceProfiler::CheckWithinThreshold(size_t idx, LoadStatus& load_status) +{ + return load_status.latencies[idx] < (latency_threshold_ms_ * 1000 * 1000); +} + cb::Error InferenceProfiler::MergeServerSideStats( std::vector& server_side_stats, @@ -1172,8 +1214,8 @@ InferenceProfiler::SummarizeServerStatsHelper( const std::map& end_status, ServerSideStats* server_stats) { - // If model_version is an empty string then look in the end status to find the - // latest (highest valued version) and use that as the version. + // If model_version is an empty string then look in the end status to find + // the latest (highest valued version) and use that as the version. int64_t status_model_version = -1; if (model_identifier.second.empty()) { for (const auto& id : end_status) { @@ -1346,6 +1388,50 @@ class TestInferenceProfiler { inference_profiler.ValidLatencyMeasurement( valid_range, valid_sequence_count, delayed_request_count, latencies); } + + + static bool TestCheckWithinThreshold( + LoadStatus& ls, LoadParams& lp, uint64_t latency_threshold_ms) + { + InferenceProfiler ip; + size_t idx = ls.infer_per_sec.size() - lp.stability_window; + ip.latency_threshold_ms_ = latency_threshold_ms; + + return ip.CheckWithinThreshold(idx, ls); + } + + static bool TestCheckWindowForStability(LoadStatus& ls, LoadParams& lp) + { + size_t idx = ls.infer_per_sec.size() - lp.stability_window; + + InferenceProfiler ip; + ip.load_parameters_.stability_threshold = lp.stability_threshold; + ip.load_parameters_.stability_window = lp.stability_window; + + return ip.CheckWindowForStability(idx, ls); + }; + + static bool TestDetermineStability(LoadStatus& ls, LoadParams& lp) + { + InferenceProfiler ip; + ip.load_parameters_.stability_threshold = lp.stability_threshold; + ip.load_parameters_.stability_window = lp.stability_window; + + return ip.DetermineStability(ls); + } + + static bool TestIsDoneProfiling( + LoadStatus& ls, LoadParams& lp, uint64_t latency_threshold_ms) + { + InferenceProfiler ip; + ip.load_parameters_.stability_threshold = lp.stability_threshold; + ip.load_parameters_.stability_window = lp.stability_window; + ip.latency_threshold_ms_ = latency_threshold_ms; + ip.mpi_driver_ = std::make_shared(false); + + bool is_stable = ip.DetermineStability(ls); + return ip.IsDoneProfiling(ls, &is_stable); + }; }; TEST_CASE("testing the ValidLatencyMeasurement function") @@ -1397,5 +1483,160 @@ TEST_CASE("testing the ValidLatencyMeasurement function") CHECK(latencies[2] == convert_timestamp_to_latency(all_timestamps[3])); } +TEST_CASE("test_check_window_for_stability") +{ + LoadStatus ls; + LoadParams lp; + + SUBCASE("test throughput not stable") + { + ls.infer_per_sec = {1.0, 1000.0, 500.0}; + ls.latencies = {1, 1, 1}; + lp.stability_window = 3; + lp.stability_threshold = 0.1; + CHECK(TestInferenceProfiler::TestCheckWindowForStability(ls, lp) == false); + } + SUBCASE("test throughput stable") + { + ls.infer_per_sec = {500.0, 520.0, 510.0}; + ls.latencies = {1, 1, 1}; + lp.stability_window = 3; + lp.stability_threshold = 0.1; + CHECK(TestInferenceProfiler::TestCheckWindowForStability(ls, lp) == true); + } + SUBCASE("test latency not stable") + { + ls.infer_per_sec = {500.0, 520.0, 510.0}; + ls.latencies = {1, 100, 50}; + lp.stability_window = 3; + lp.stability_threshold = 0.1; + CHECK(TestInferenceProfiler::TestCheckWindowForStability(ls, lp) == false); + } + SUBCASE("test latency stable") + { + ls.infer_per_sec = {500.0, 520.0, 510.0}; + ls.latencies = {45, 50, 45}; + lp.stability_window = 3; + lp.stability_threshold = 0.1; + CHECK(TestInferenceProfiler::TestCheckWindowForStability(ls, lp) == true); + } + SUBCASE("test throughput stable after many measurements") + { + ls.infer_per_sec = {1.0, 1000.0, 500.0, 1500.0, 500.0, 520.0, 510.0}; + ls.latencies = {1, 1, 1, 1, 1, 1, 1}; + lp.stability_window = 3; + lp.stability_threshold = 0.1; + CHECK(TestInferenceProfiler::TestCheckWindowForStability(ls, lp) == true); + } +} + +TEST_CASE("test check within threshold") +{ + LoadStatus ls; + LoadParams lp; + + ls.infer_per_sec = {500.0, 520.0, 510.0}; + lp.stability_window = 3; + lp.stability_threshold = 0.1; + uint64_t latency_threshold_ms = 1; + + SUBCASE("test not within threshold") + { + ls.latencies = {2000000, 2000000, 2000000}; + CHECK( + TestInferenceProfiler::TestCheckWithinThreshold( + ls, lp, latency_threshold_ms) == false); + } + + SUBCASE("test within threshold") + { + ls.latencies = {100000, 100000, 100000}; + CHECK( + TestInferenceProfiler::TestCheckWithinThreshold( + ls, lp, latency_threshold_ms) == true); + } +} + +TEST_CASE("test_determine_stability") +{ + LoadStatus ls; + LoadParams lp; + + SUBCASE("test inference equals zero") + { + ls.infer_per_sec = {500.0, 0.0, 510.0}; + ls.latencies = {1, 1, 1}; + lp.stability_window = 3; + lp.stability_threshold = 0.1; + uint64_t latency_threshold_ms = 1; + CHECK(TestInferenceProfiler::TestDetermineStability(ls, lp) == false); + + ls.infer_per_sec = {500.0, 520.0, 510.0}; + CHECK(TestInferenceProfiler::TestDetermineStability(ls, lp) == true); + } +} + +TEST_CASE("test_is_done_profiling") +{ + LoadStatus ls; + LoadParams lp; + + + SUBCASE("test latency_threshold is NO_LIMIT") + { + ls.infer_per_sec = {1.0, 1000.0, 500.0}; + ls.latencies = {1, 1, 1}; + lp.stability_window = 3; + lp.stability_threshold = 0.1; + uint64_t latency_threshold_ms = NO_LIMIT; + + CHECK( + TestInferenceProfiler::TestIsDoneProfiling( + ls, lp, latency_threshold_ms) == false); + } + + SUBCASE("test not within threshold from done profiling") + { + ls.infer_per_sec = {1.0, 1000.0, 500.0}; + ls.latencies = {2000000, 2000000, 2000000}; + lp.stability_window = 3; + lp.stability_threshold = 0.1; + uint64_t latency_threshold_ms = 1; + CHECK( + TestInferenceProfiler::TestIsDoneProfiling( + ls, lp, latency_threshold_ms) == true); + } + + SUBCASE("test stability from is done profiling") + { + ls.infer_per_sec = {1.0, 1000.0, 500.0}; + ls.latencies = {1, 1, 1}; + lp.stability_window = 3; + lp.stability_threshold = 0.1; + uint64_t latency_threshold_ms = 1; + + CHECK( + TestInferenceProfiler::TestIsDoneProfiling( + ls, lp, latency_threshold_ms) == false); + ls.infer_per_sec = {500.0, 520.0, 510.0}; + + CHECK( + TestInferenceProfiler::TestIsDoneProfiling( + ls, lp, latency_threshold_ms) == true); + } + + SUBCASE("test underflow") + { + ls.infer_per_sec = {500.0, 510.0}; + ls.latencies = {1, 1}; + lp.stability_window = 3; + lp.stability_threshold = 0.1; + uint64_t latency_threshold_ms = 1; + CHECK( + TestInferenceProfiler::TestIsDoneProfiling( + ls, lp, latency_threshold_ms) == false); + } +} + #endif }} // namespace triton::perfanalyzer diff --git a/src/c++/perf_analyzer/inference_profiler.h b/src/c++/perf_analyzer/inference_profiler.h index c151460ae..ae7d8c217 100644 --- a/src/c++/perf_analyzer/inference_profiler.h +++ b/src/c++/perf_analyzer/inference_profiler.h @@ -35,6 +35,10 @@ namespace triton { namespace perfanalyzer { +#ifndef DOCTEST_CONFIG_DISABLE +class TestInferenceProfiler; +#endif + /// Constant parameters that determine the whether stopping criteria has met /// for the current phase of testing struct LoadParams { @@ -338,6 +342,45 @@ class InferenceProfiler { cb::Error ProfileHelper( const bool clean_starts, PerfStatus& status_summary, bool* is_stable); + /// A helper function to determine if profiling is stable + /// \param load_status Stores the observations of infer_per_sec and latencies + /// \return Returns if the threshold and latencies are stable. + bool DetermineStability(LoadStatus& load_status); + + /// Check if latency at index idx is within the latency threshold + /// \param idx index in latency vector + /// \param load_status Stores the observations of infer_per_sec and latencies + /// \return Returns whether the latencies are below the max threshold + bool CheckWithinThreshold(size_t idx, LoadStatus& load_status); + + /// A helper function to determine if profiling is done + /// \param load_status Stores the observations of infer_per_sec and latencies + /// \param is_stable Returns whether the measurement stabilized or not. + /// \return Returns if we should break out of the infinite stability check + /// loop. + bool IsDoneProfiling(LoadStatus& load_status, bool* is_stable); + + /// Check if observed inferences and latencies are within threshold + /// for a single window starting at idx + /// \param idx index in latency vector + /// \param load_status Stores the observations of infer_per_sec and latencies + /// \return Returns whether inference and latency are stable + bool CheckWindowForStability(size_t idx, LoadStatus& load_status); + + /// Check if observed inferences are within threshold + /// for a single window starting at idx + /// \param idx index in latency vector + /// \param load_status Stores the observations of infer_per_sec and latencies + /// \return Returns whether inference is stable + bool IsInferWindowStable(size_t idx, LoadStatus& load_status); + + /// Check if observed latencies are within threshold + /// for a single window starting at idx + /// \param idx index in latency vector + /// \param load_status Stores the observations of infer_per_sec and latencies + /// \return Returns whether latency is stable + bool IsLatencyWindowStable(size_t idx, LoadStatus& load_status); + /// Helper function to perform measurement. /// \param status_summary The summary of this measurement. /// \param measurement_window Indicating the number of requests or the @@ -500,5 +543,4 @@ class InferenceProfiler { InferenceProfiler() = default; #endif }; - }} // namespace triton::perfanalyzer diff --git a/src/c++/perf_analyzer/perf_analyzer.cc b/src/c++/perf_analyzer/perf_analyzer.cc index b49173c6e..96ae86cea 100644 --- a/src/c++/perf_analyzer/perf_analyzer.cc +++ b/src/c++/perf_analyzer/perf_analyzer.cc @@ -488,11 +488,9 @@ Usage(char** argv, const std::string& msg = std::string()) << FormatMessage( " --stability-percentage (-s): Indicates the allowed variation in " "latency measurements when determining if a result is stable. The " - "measurement is considered as stable if the recent 3 measurements " - "are within +/- (stability percentage)% of their average in terms " - "of both infer per second and latency. When perf analyzer " - "determines that the measurements are stable, it returns average " - "of the measurements collected in the last 3 windows. Default is " + "measurement is considered as stable if the ratio of max / min " + "from the recent 3 measurements is within (stability percentage)% " + "in terms of both infer per second and latency. Default is " "10(%).", 18) << std::endl;