Add request send rate detection (triton-inference-server#255)
* Add request send rate detection

* Addressed comments

* Addressed comments

* Addressed comments
matthewkotila authored Mar 2, 2023
1 parent 2900c5e commit a4952d8
Showing 10 changed files with 187 additions and 3 deletions.
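In brief: each worker thread now counts the requests it sends in ThreadStat (an atomic counter incremented in InferContext::SendRequest), LoadManager::GetAndResetNumSentRequests() sums and clears those counters once per measurement window, and InferenceProfiler::SummarizeSendRequestRate() reports the result as send_request_rate on PerfStatus. Unit tests cover each layer.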
1 change: 1 addition & 0 deletions src/c++/perf_analyzer/infer_context.cc
@@ -106,6 +106,7 @@ InferContext::SendRequest(const uint64_t request_id, const bool delayed)
return;
}

thread_stat_->num_sent_requests_++;
if (async_) {
infer_data_.options_->request_id_ = std::to_string(request_id);
{
2 changes: 2 additions & 0 deletions src/c++/perf_analyzer/infer_context.h
@@ -58,6 +58,8 @@ struct ThreadStat {
TimestampVector request_timestamps_;
// A lock to protect thread data
std::mutex mu_;
// The number of requests sent by this thread.
std::atomic<size_t> num_sent_requests_{0};
};

/// The properties of an asynchronous request required in
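The new counter is std::atomic because a worker thread increments it while the profiler thread reads and resets it between measurement windows. A minimal standalone sketch of that pattern (illustrative only, not perf_analyzer code):

#include <atomic>
#include <cstddef>
#include <thread>

int main()
{
  std::atomic<std::size_t> num_sent{0};

  // Worker thread: one increment per simulated request send.
  std::thread worker([&num_sent] {
    for (int i = 0; i < 1000; i++) {
      num_sent++;
    }
  });

  // The main thread may sample (or reset) the counter concurrently
  // without a data race, which a plain size_t would not allow.
  std::size_t sampled{num_sent.load()};
  (void)sampled;

  worker.join();
  return num_sent == 1000 ? 0 : 1;
}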
20 changes: 19 additions & 1 deletion src/c++/perf_analyzer/inference_profiler.cc
@@ -1,4 +1,4 @@
- // Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -1173,6 +1173,12 @@ InferenceProfiler::Summarize(

SummarizeOverhead(window_duration_ns, manager_->GetIdleTime(), summary);

double window_duration_s{window_duration_ns /
static_cast<double>(NANOS_PER_SECOND)};

SummarizeSendRequestRate(
window_duration_s, manager_->GetAndResetNumSentRequests(), summary);

if (include_server_stats_) {
RETURN_IF_ERROR(SummarizeServerStats(
start_status, end_status, &(summary.server_stats)));
@@ -1337,6 +1343,18 @@ InferenceProfiler::SummarizeClientStat(
return cb::Error::Success;
}

void
InferenceProfiler::SummarizeSendRequestRate(
const double window_duration_s, const size_t num_sent_requests,
PerfStatus& summary)
{
if (window_duration_s <= 0.0) {
throw std::runtime_error("window_duration_s must be positive");
}

summary.send_request_rate = num_sent_requests / window_duration_s;
}

cb::Error
InferenceProfiler::SummarizeServerStatsHelper(
const cb::ModelIdentifier& model_identifier,
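The computation reduces to requests sent divided by window length. A minimal standalone sketch (hypothetical free function; in the commit this logic lives in the private member above, which writes into PerfStatus):

#include <cstddef>
#include <stdexcept>

double SendRequestRate(
    const double window_duration_s, const std::size_t num_sent_requests)
{
  if (window_duration_s <= 0.0) {
    throw std::runtime_error("window_duration_s must be positive");
  }
  // e.g. 100 requests over a 2-second window -> 50 requests per second
  return num_sent_requests / window_duration_s;
}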
14 changes: 13 additions & 1 deletion src/c++/perf_analyzer/inference_profiler.h
@@ -1,4 +1,4 @@
- // Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -157,6 +157,8 @@ struct PerfStatus {

// placeholder for the latency value that is used for conditional checking
uint64_t stabilizing_latency_ns;
// Metric for requests sent per second
double send_request_rate{0.0};
};

cb::Error ReportPrometheusMetrics(const Metrics& metrics);
@@ -466,6 +468,16 @@ class InferenceProfiler {
const size_t delayed_request_count, const size_t valid_sequence_count,
PerfStatus& summary);

/// Adds the send request rate metric to the summary object.
/// \param window_duration_s The duration of the window in seconds.
/// \param num_sent_requests The number of requests sent during the last
/// window.
/// \param summary The summary object to be updated with the send request rate
/// metric.
void SummarizeSendRequestRate(
const double window_duration_s, const size_t num_sent_requests,
PerfStatus& summary);

/// \param model_identifier A pair of model_name and model_version to identify
/// a specific model.
/// \param start_status The model status at the start of the measurement.
13 changes: 13 additions & 0 deletions src/c++/perf_analyzer/load_manager.cc
@@ -145,6 +145,19 @@ LoadManager::ResetIdleTime()
}
}

const size_t
LoadManager::GetAndResetNumSentRequests()
{
size_t num_sent_requests{0};

for (auto& thread_stat : threads_stat_) {
num_sent_requests += thread_stat->num_sent_requests_;
thread_stat->num_sent_requests_ = 0;
}

return num_sent_requests;
}
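One subtlety in the loop above: reading a counter and then assigning zero are two separate operations, so an increment arriving between them could be lost. A sketch of an alternative (not what the commit does) that drains each counter in a single atomic step, assuming the ThreadStat definition shown earlier and a hypothetical function name:

#include <cstddef>
#include <memory>
#include <vector>

size_t
DrainNumSentRequests(std::vector<std::shared_ptr<ThreadStat>>& threads_stat)
{
  size_t num_sent_requests{0};

  for (auto& thread_stat : threads_stat) {
    // exchange(0) returns the previous value and clears the counter in one
    // atomic step, so no concurrent increment can slip between read and reset.
    num_sent_requests += thread_stat->num_sent_requests_.exchange(0);
  }

  return num_sent_requests;
}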

LoadManager::LoadManager(
const bool async, const bool streaming, const int32_t batch_size,
const size_t max_threads, const SharedMemoryType shared_memory_type,
5 changes: 5 additions & 0 deletions src/c++/perf_analyzer/load_manager.h
@@ -86,6 +86,11 @@ class LoadManager {
///
void ResetIdleTime();

/// Calculates and returns the total number of requests sent across all
/// threads, resetting each thread's sent-request count to zero.
/// \return The total number of sent requests across all threads.
const size_t GetAndResetNumSentRequests();

/// \return the batch size used for the inference requests
size_t BatchSize() const { return batch_size_; }

29 changes: 29 additions & 0 deletions src/c++/perf_analyzer/test_concurrency_manager.cc
@@ -83,6 +83,8 @@ class TestConcurrencyManager : public TestLoadManagerBase,
}
}

void StopWorkerThreads() { LoadManager::StopWorkerThreads(); }

/// Test that the correct Infer function is called in the backend
///
void TestInferType()
@@ -783,4 +785,31 @@ TEST_CASE("concurrency_overhead")
tcm.TestOverhead();
}

TEST_CASE(
"send_request_rate_concurrency_manager: testing logic around detecting "
"send request count")
{
PerfAnalyzerParameters params{};

SUBCASE("sync") { params.async = false; }
SUBCASE("async") { params.async = true; }

TestConcurrencyManager tcm(params);

tcm.stats_->SetDelays({10});

tcm.InitManager(
params.string_length, params.string_data, params.zero_input,
params.user_data, params.start_sequence_id, params.sequence_id_range,
params.sequence_length);

tcm.ChangeConcurrencyLevel(4);
std::this_thread::sleep_for(std::chrono::milliseconds(100));
tcm.StopWorkerThreads();

const size_t num_sent_requests{tcm.GetAndResetNumSentRequests()};

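  // Expected count: 4 concurrent request slots * (100 ms window / 10 ms per
  // request) = 40 sends, within the 10% relative tolerance of Approx.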
CHECK(num_sent_requests == doctest::Approx(40).epsilon(0.1));
}

}} // namespace triton::perfanalyzer
47 changes: 46 additions & 1 deletion src/c++/perf_analyzer/test_inference_profiler.cc
@@ -1,4 +1,4 @@
- // Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -50,6 +50,14 @@ class TestInferenceProfiler : public InferenceProfiler {
return inference_profiler.GetMeanAndStdDev(latencies);
}

void SummarizeSendRequestRate(
const double window_duration_s, const size_t num_sent_requests,
PerfStatus& summary)
{
InferenceProfiler::SummarizeSendRequestRate(
window_duration_s, num_sent_requests, summary);
}

static bool TestCheckWithinThreshold(
LoadStatus& ls, LoadParams& lp, uint64_t latency_threshold_ms)
{
@@ -679,4 +687,41 @@ TEST_CASE("InferenceProfiler: Test SummarizeOverhead")
}
}

TEST_CASE(
"summarize_send_request_rate: testing the SummarizeSendRequestRate "
"function")
{
TestInferenceProfiler tip{};
PerfStatus perf_status;

SUBCASE("invalid zero window duration")
{
double window_duration_s{0.0};
size_t num_sent_requests{0};
CHECK_THROWS_WITH_AS(
tip.SummarizeSendRequestRate(
window_duration_s, num_sent_requests, perf_status),
"window_duration_s must be positive", std::runtime_error);
}

SUBCASE("invalid negative window duration")
{
double window_duration_s{-1.0};
size_t num_sent_requests{0};
CHECK_THROWS_WITH_AS(
tip.SummarizeSendRequestRate(
window_duration_s, num_sent_requests, perf_status),
"window_duration_s must be positive", std::runtime_error);
}

SUBCASE("regular case")
{
double window_duration_s{2.0};
size_t num_sent_requests{100};
tip.SummarizeSendRequestRate(
window_duration_s, num_sent_requests, perf_status);
CHECK(perf_status.send_request_rate == doctest::Approx(50));
}
}

}} // namespace triton::perfanalyzer
30 changes: 30 additions & 0 deletions src/c++/perf_analyzer/test_load_manager.cc
@@ -47,6 +47,9 @@ class TestLoadManager : public TestLoadManagerBase, public LoadManager {
{
}

std::vector<std::shared_ptr<ThreadStat>>& threads_stat_{
LoadManager::threads_stat_};

/// Test the public function CheckHealth
///
/// It will return a bad result if any of the thread stats
@@ -387,4 +390,31 @@ TEST_CASE("load_manager: Test public idle time functions")
tlm.TestIdle();
}

TEST_CASE(
"send_request_rate_load_manager: testing the GetAndResetNumSentRequests "
"function")
{
PerfAnalyzerParameters params{};

TestLoadManager tlm(params);

std::shared_ptr<ThreadStat> thread_stat_1{std::make_shared<ThreadStat>()};
std::shared_ptr<ThreadStat> thread_stat_2{std::make_shared<ThreadStat>()};

std::chrono::steady_clock::time_point start_time{
std::chrono::steady_clock::time_point::min()};

thread_stat_1->num_sent_requests_ = 6;
thread_stat_2->num_sent_requests_ = 5;

tlm.threads_stat_ = {thread_stat_1, thread_stat_2};

const size_t result{tlm.GetAndResetNumSentRequests()};

CHECK(result == 11);
CHECK(tlm.threads_stat_.size() == 2);
CHECK(tlm.threads_stat_[0]->num_sent_requests_ == 0);
CHECK(tlm.threads_stat_[1]->num_sent_requests_ == 0);
}

}} // namespace triton::perfanalyzer
29 changes: 29 additions & 0 deletions src/c++/perf_analyzer/test_request_rate_manager.cc
@@ -90,6 +90,8 @@ class TestRequestRateManager : public TestLoadManagerBase,
}
}

void StopWorkerThreads() { LoadManager::StopWorkerThreads(); }

void TestSchedule(double rate, PerfAnalyzerParameters params)
{
PauseWorkers();
@@ -1120,4 +1122,31 @@ TEST_CASE("request_rate_overhead")
trrm.TestOverhead(rate);
}

std::chrono::steady_clock::time_point mk_start{};

TEST_CASE(
"send_request_rate_request_rate_manager: testing logic around detecting "
"send request count")
{
PerfAnalyzerParameters params{};

SUBCASE("sync") { params.async = false; }
SUBCASE("async") { params.async = true; }

TestRequestRateManager trrm(params);

trrm.InitManager(
params.string_length, params.string_data, params.zero_input,
params.user_data, params.start_sequence_id, params.sequence_id_range,
params.sequence_length);

trrm.ChangeRequestRate(1000);
std::this_thread::sleep_for(std::chrono::milliseconds(50));
trrm.StopWorkerThreads();

const size_t num_sent_requests{trrm.GetAndResetNumSentRequests()};

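  // Expected count: 1000 requests/second * 0.05 s window = 50 sends, within
  // the 10% relative tolerance of Approx.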
CHECK(num_sent_requests == doctest::Approx(50).epsilon(0.1));
}

}} // namespace triton::perfanalyzer
