Add request send rate detection (triton-inference-server#255)
* Add request send rate detection

* Addressed comments

* Addressed comments

* Addressed comments
matthewkotila authored Mar 2, 2023
1 parent 2900c5e commit a4952d8
Showing 10 changed files with 187 additions and 3 deletions.
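In brief: each worker thread now counts the requests it sends in ThreadStat (an atomic counter incremented in InferContext::SendRequest), LoadManager::GetAndResetNumSentRequests() sums and clears those counters once per measurement window, and InferenceProfiler::SummarizeSendRequestRate() reports the result as send_request_rate on PerfStatus. Unit tests cover each layer.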
1 change: 1 addition & 0 deletions src/c++/perf_analyzer/infer_context.cc
@@ -106,6 +106,7 @@ InferContext::SendRequest(const uint64_t request_id, const bool delayed)
return;
}

thread_stat_->num_sent_requests_++;
if (async_) {
infer_data_.options_->request_id_ = std::to_string(request_id);
{
2 changes: 2 additions & 0 deletions src/c++/perf_analyzer/infer_context.h
@@ -58,6 +58,8 @@ struct ThreadStat {
TimestampVector request_timestamps_;
// A lock to protect thread data
std::mutex mu_;
// The number of requests sent by this thread.
std::atomic<size_t> num_sent_requests_{0};
};

/// The properties of an asynchronous request required in
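The new counter is std::atomic because a worker thread increments it while the profiler thread reads and resets it between measurement windows. A minimal standalone sketch of that pattern (illustrative only, not perf_analyzer code):

#include <atomic>
#include <cstddef>
#include <thread>

int main()
{
  std::atomic<std::size_t> num_sent{0};

  // Worker thread: one increment per simulated request send.
  std::thread worker([&num_sent] {
    for (int i = 0; i < 1000; i++) {
      num_sent++;
    }
  });

  // The main thread may sample (or reset) the counter concurrently
  // without a data race, which a plain size_t would not allow.
  std::size_t sampled{num_sent.load()};
  (void)sampled;

  worker.join();
  return num_sent == 1000 ? 0 : 1;
}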
20 changes: 19 additions & 1 deletion src/c++/perf_analyzer/inference_profiler.cc
@@ -1,4 +1,4 @@
- // Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -1173,6 +1173,12 @@ InferenceProfiler::Summarize(

SummarizeOverhead(window_duration_ns, manager_->GetIdleTime(), summary);

double window_duration_s{window_duration_ns /
static_cast<double>(NANOS_PER_SECOND)};

SummarizeSendRequestRate(
window_duration_s, manager_->GetAndResetNumSentRequests(), summary);

if (include_server_stats_) {
RETURN_IF_ERROR(SummarizeServerStats(
start_status, end_status, &(summary.server_stats)));
@@ -1337,6 +1343,18 @@ InferenceProfiler::SummarizeClientStat(
return cb::Error::Success;
}

void
InferenceProfiler::SummarizeSendRequestRate(
const double window_duration_s, const size_t num_sent_requests,
PerfStatus& summary)
{
if (window_duration_s <= 0.0) {
throw std::runtime_error("window_duration_s must be positive");
}

summary.send_request_rate = num_sent_requests / window_duration_s;
}

cb::Error
InferenceProfiler::SummarizeServerStatsHelper(
const cb::ModelIdentifier& model_identifier,
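The computation reduces to requests sent divided by window length. A minimal standalone sketch (hypothetical free function; in the commit this logic lives in the private member above, which writes into PerfStatus):

#include <cstddef>
#include <stdexcept>

double SendRequestRate(
    const double window_duration_s, const std::size_t num_sent_requests)
{
  if (window_duration_s <= 0.0) {
    throw std::runtime_error("window_duration_s must be positive");
  }
  // e.g. 100 requests over a 2-second window -> 50 requests per second
  return num_sent_requests / window_duration_s;
}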
14 changes: 13 additions & 1 deletion src/c++/perf_analyzer/inference_profiler.h
@@ -1,4 +1,4 @@
- // Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -157,6 +157,8 @@ struct PerfStatus {

// placeholder for the latency value that is used for conditional checking
uint64_t stabilizing_latency_ns;
// Metric for requests sent per second
double send_request_rate{0.0};
};

cb::Error ReportPrometheusMetrics(const Metrics& metrics);
@@ -466,6 +468,16 @@ class InferenceProfiler {
const size_t delayed_request_count, const size_t valid_sequence_count,
PerfStatus& summary);

/// Adds the send request rate metric to the summary object.
/// \param window_duration_s The duration of the window in seconds.
/// \param num_sent_requests The number of requests sent during the last
/// window.
/// \param summary The summary object to be updated with the send request rate
/// metric.
void SummarizeSendRequestRate(
const double window_duration_s, const size_t num_sent_requests,
PerfStatus& summary);

/// \param model_identifier A pair of model_name and model_version to identify
/// a specific model.
/// \param start_status The model status at the start of the measurement.
13 changes: 13 additions & 0 deletions src/c++/perf_analyzer/load_manager.cc
@@ -145,6 +145,19 @@ LoadManager::ResetIdleTime()
}
}

const size_t
LoadManager::GetAndResetNumSentRequests()
{
size_t num_sent_requests{0};

for (auto& thread_stat : threads_stat_) {
num_sent_requests += thread_stat->num_sent_requests_;
thread_stat->num_sent_requests_ = 0;
}

return num_sent_requests;
}
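One subtlety in the loop above: reading a counter and then assigning zero are two separate operations, so an increment arriving between them could be lost. A sketch of an alternative (not what the commit does) that drains each counter in a single atomic step, assuming the ThreadStat definition shown earlier and a hypothetical function name:

#include <cstddef>
#include <memory>
#include <vector>

size_t
DrainNumSentRequests(std::vector<std::shared_ptr<ThreadStat>>& threads_stat)
{
  size_t num_sent_requests{0};

  for (auto& thread_stat : threads_stat) {
    // exchange(0) returns the previous value and clears the counter in one
    // atomic step, so no concurrent increment can slip between read and reset.
    num_sent_requests += thread_stat->num_sent_requests_.exchange(0);
  }

  return num_sent_requests;
}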

LoadManager::LoadManager(
const bool async, const bool streaming, const int32_t batch_size,
const size_t max_threads, const SharedMemoryType shared_memory_type,
5 changes: 5 additions & 0 deletions src/c++/perf_analyzer/load_manager.h
@@ -86,6 +86,11 @@ class LoadManager {
///
void ResetIdleTime();

/// Calculates and returns the total number of requests sent across all
/// threads, resetting each thread's sent-request count to zero.
/// \return The total number of sent requests across all threads.
const size_t GetAndResetNumSentRequests();

/// \return the batch size used for the inference requests
size_t BatchSize() const { return batch_size_; }

29 changes: 29 additions & 0 deletions src/c++/perf_analyzer/test_concurrency_manager.cc
@@ -83,6 +83,8 @@ class TestConcurrencyManager : public TestLoadManagerBase,
}
}

void StopWorkerThreads() { LoadManager::StopWorkerThreads(); }

/// Test that the correct Infer function is called in the backend
///
void TestInferType()
@@ -783,4 +785,31 @@ TEST_CASE("concurrency_overhead")
tcm.TestOverhead();
}

TEST_CASE(
"send_request_rate_concurrency_manager: testing logic around detecting "
"send request count")
{
PerfAnalyzerParameters params{};

SUBCASE("sync") { params.async = false; }
SUBCASE("async") { params.async = true; }

TestConcurrencyManager tcm(params);

tcm.stats_->SetDelays({10});

tcm.InitManager(
params.string_length, params.string_data, params.zero_input,
params.user_data, params.start_sequence_id, params.sequence_id_range,
params.sequence_length);

tcm.ChangeConcurrencyLevel(4);
std::this_thread::sleep_for(std::chrono::milliseconds(100));
tcm.StopWorkerThreads();

const size_t num_sent_requests{tcm.GetAndResetNumSentRequests()};

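  // Expected count: 4 concurrent request slots * (100 ms window / 10 ms per
  // request) = 40 sends, within the 10% relative tolerance of Approx.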
CHECK(num_sent_requests == doctest::Approx(40).epsilon(0.1));
}

}} // namespace triton::perfanalyzer
47 changes: 46 additions & 1 deletion src/c++/perf_analyzer/test_inference_profiler.cc
@@ -1,4 +1,4 @@
- // Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -50,6 +50,14 @@ class TestInferenceProfiler : public InferenceProfiler {
return inference_profiler.GetMeanAndStdDev(latencies);
}

void SummarizeSendRequestRate(
const double window_duration_s, const size_t num_sent_requests,
PerfStatus& summary)
{
InferenceProfiler::SummarizeSendRequestRate(
window_duration_s, num_sent_requests, summary);
}

static bool TestCheckWithinThreshold(
LoadStatus& ls, LoadParams& lp, uint64_t latency_threshold_ms)
{
@@ -679,4 +687,41 @@ TEST_CASE("InferenceProfiler: Test SummarizeOverhead")
}
}

TEST_CASE(
"summarize_send_request_rate: testing the SummarizeSendRequestRate "
"function")
{
TestInferenceProfiler tip{};
PerfStatus perf_status;

SUBCASE("invalid zero window duration")
{
double window_duration_s{0.0};
size_t num_sent_requests{0};
CHECK_THROWS_WITH_AS(
tip.SummarizeSendRequestRate(
window_duration_s, num_sent_requests, perf_status),
"window_duration_s must be positive", std::runtime_error);
}

SUBCASE("invalid negative window duration")
{
double window_duration_s{-1.0};
size_t num_sent_requests{0};
CHECK_THROWS_WITH_AS(
tip.SummarizeSendRequestRate(
window_duration_s, num_sent_requests, perf_status),
"window_duration_s must be positive", std::runtime_error);
}

SUBCASE("regular case")
{
double window_duration_s{2.0};
size_t num_sent_requests{100};
tip.SummarizeSendRequestRate(
window_duration_s, num_sent_requests, perf_status);
CHECK(perf_status.send_request_rate == doctest::Approx(50));
}
}

}} // namespace triton::perfanalyzer
30 changes: 30 additions & 0 deletions src/c++/perf_analyzer/test_load_manager.cc
@@ -47,6 +47,9 @@ class TestLoadManager : public TestLoadManagerBase, public LoadManager {
{
}

std::vector<std::shared_ptr<ThreadStat>>& threads_stat_{
LoadManager::threads_stat_};

/// Test the public function CheckHealth
///
/// It will return a bad result if any of the thread stats
@@ -387,4 +390,31 @@ TEST_CASE("load_manager: Test public idle time functions")
tlm.TestIdle();
}

TEST_CASE(
"send_request_rate_load_manager: testing the GetAndResetNumSentRequests "
"function")
{
PerfAnalyzerParameters params{};

TestLoadManager tlm(params);

std::shared_ptr<ThreadStat> thread_stat_1{std::make_shared<ThreadStat>()};
std::shared_ptr<ThreadStat> thread_stat_2{std::make_shared<ThreadStat>()};

std::chrono::steady_clock::time_point start_time{
std::chrono::steady_clock::time_point::min()};

thread_stat_1->num_sent_requests_ = 6;
thread_stat_2->num_sent_requests_ = 5;

tlm.threads_stat_ = {thread_stat_1, thread_stat_2};

const size_t result{tlm.GetAndResetNumSentRequests()};

CHECK(result == 11);
CHECK(tlm.threads_stat_.size() == 2);
CHECK(tlm.threads_stat_[0]->num_sent_requests_ == 0);
CHECK(tlm.threads_stat_[1]->num_sent_requests_ == 0);
}

}} // namespace triton::perfanalyzer
29 changes: 29 additions & 0 deletions src/c++/perf_analyzer/test_request_rate_manager.cc
@@ -90,6 +90,8 @@ class TestRequestRateManager : public TestLoadManagerBase,
}
}

void StopWorkerThreads() { LoadManager::StopWorkerThreads(); }

void TestSchedule(double rate, PerfAnalyzerParameters params)
{
PauseWorkers();
@@ -1120,4 +1122,31 @@ TEST_CASE("request_rate_overhead")
trrm.TestOverhead(rate);
}

std::chrono::steady_clock::time_point mk_start{};

TEST_CASE(
"send_request_rate_request_rate_manager: testing logic around detecting "
"send request count")
{
PerfAnalyzerParameters params{};

SUBCASE("sync") { params.async = false; }
SUBCASE("async") { params.async = true; }

TestRequestRateManager trrm(params);

trrm.InitManager(
params.string_length, params.string_data, params.zero_input,
params.user_data, params.start_sequence_id, params.sequence_id_range,
params.sequence_length);

trrm.ChangeRequestRate(1000);
std::this_thread::sleep_for(std::chrono::milliseconds(50));
trrm.StopWorkerThreads();

const size_t num_sent_requests{trrm.GetAndResetNumSentRequests()};

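  // Expected count: 1000 requests/second * 0.05 s window = 50 sends, within
  // the 10% relative tolerance of Approx.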
CHECK(num_sent_requests == doctest::Approx(50).epsilon(0.1));
}

}} // namespace triton::perfanalyzer
