
Commit

Allow changing metrics update frequency by specifying metrics-interval-secs (triton-inference-server#3338)

* Allow the user to change the metrics update frequency by specifying metrics-interval-secs

* Changed metrics interval unit to ms, halved sampling time

* Mentioned defaults, removed type cast
Ashwin Ramesh authored Sep 10, 2021
1 parent 1ee8abe commit e65d6c6
Showing 9 changed files with 86 additions and 11 deletions.
29 changes: 29 additions & 0 deletions qa/L0_metrics/test.sh
@@ -94,6 +94,35 @@ kill $SERVER_PID
wait $SERVER_PID


# Test metrics interval by polling the metrics endpoint and checking that energy consumption updates
METRICS_INTERVAL_MS=500
METRICS_INTERVAL_SECS=0.5

SERVER_ARGS="$SERVER_ARGS --metrics-interval-ms=${METRICS_INTERVAL_MS}"
run_server
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
exit 1
fi

num_iterations=10
prev_energy=`curl -s localhost:8002/metrics | awk '/nv_energy_consumption{/ {print $2}'`
for (( i = 0; i < $num_iterations; ++i )); do
sleep $METRICS_INTERVAL_SECS
current_energy=`curl -s localhost:8002/metrics | awk '/nv_energy_consumption{/ {print $2}'`
if [ "$current_energy" == "$prev_energy" ]; then
echo "Metrics were not updated in an interval of ${METRICS_INTERVAL_MS} milliseconds"
echo -e "\n***\n*** Metric Interval test failed. \n***"
RET=1
break
fi
prev_energy=$current_energy
done

kill $SERVER_PID
wait $SERVER_PID

if [ $RET -eq 0 ]; then
echo -e "\n***\n*** Test Passed\n***"
else
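The loop above polls the Prometheus endpoint once per interval and fails if the nv_energy_consumption counter never changes. The same check can be run by hand; a minimal sketch, assuming a local server started with --metrics-interval-ms=500 and GPU metrics enabled on the default metrics port 8002:

# Query the endpoint twice, one interval apart, and compare the samples.
e1=$(curl -s localhost:8002/metrics | awk '/nv_energy_consumption{/ {print $2}')
sleep 0.5
e2=$(curl -s localhost:8002/metrics | awk '/nv_energy_consumption{/ {print $2}')
[ "$e1" != "$e2" ] && echo "metrics updated" || echo "no update observed"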
16 changes: 13 additions & 3 deletions src/core/metrics.cc
@@ -119,7 +119,8 @@ Metrics::Metrics()
"started")
.Register(*registry_)),
#endif // TRITON_ENABLE_METRICS_GPU
metrics_enabled_(false), gpu_metrics_enabled_(false)
metrics_enabled_(false), gpu_metrics_enabled_(false),
metrics_interval_ms_(2000)
{
}

@@ -188,6 +189,13 @@ Metrics::EnableGPUMetrics()
singleton->gpu_metrics_enabled_ = true;
}

void
Metrics::SetMetricsInterval(uint64_t metrics_interval_ms)
{
auto singleton = GetSingleton();
singleton->metrics_interval_ms_ = metrics_interval_ms;
}

bool
Metrics::InitializeDcgmMetrics()
{
@@ -355,13 +363,15 @@ Metrics::InitializeDcgmMetrics()
LOG_WARNING << "Cannot make field group: " << errorString(dcgmerr);
}
dcgmerr = dcgmWatchFields(
handle, groupId, fieldGroupId, 2000000 /*update period, usec*/,
handle, groupId, fieldGroupId,
metrics_interval_ms_ * 1000 /*update period, usec*/,
5.0 /*maxKeepAge, sec*/, 5 /*maxKeepSamples*/);
if (dcgmerr != DCGM_ST_OK) {
LOG_WARNING << "Cannot start watching fields: " << errorString(dcgmerr);
} else {
while (!dcgm_thread_exit_.load()) {
std::this_thread::sleep_for(std::chrono::milliseconds(2000));
std::this_thread::sleep_for(
std::chrono::milliseconds(metrics_interval_ms_ / 2));
dcgmUpdateAllFields(handle, 1 /* wait for update*/);
for (int didx = 0; didx < available_cuda_gpu_count; ++didx) {
uint32_t cuda_id = available_cuda_gpu_ids[didx];
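Two unit conversions happen in this hunk: DCGM takes its update period in microseconds, so the millisecond interval is multiplied by 1000, and the polling thread now sleeps for half the interval so that no update window is skipped between wakeups. A standalone sketch of that pattern, with the DCGM calls stubbed out as comments (illustrative only, not Triton code):

#include <atomic>
#include <chrono>
#include <cstdint>
#include <thread>

static std::atomic<bool> exit_flag{false};

void PollLoop(uint64_t interval_ms) {
  // dcgmWatchFields() would receive interval_ms * 1000 (microseconds) here.
  while (!exit_flag.load()) {
    // Wake twice per update period so a fresh sample is never missed.
    std::this_thread::sleep_for(std::chrono::milliseconds(interval_ms / 2));
    // dcgmUpdateAllFields(handle, 1 /* wait for update */);
  }
}

int main() {
  std::thread poller(PollLoop, 2000 /* the 2000 ms default */);
  std::this_thread::sleep_for(std::chrono::seconds(3));
  exit_flag.store(true);
  poller.join();
}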
4 changes: 4 additions & 0 deletions src/core/metrics.h
@@ -55,6 +55,9 @@ class Metrics {
// Enable reporting of GPU metrics
static void EnableGPUMetrics();

// Set the time interval in ms at which metrics are collected
static void SetMetricsInterval(uint64_t metrics_interval_ms);

// Get the prometheus registry
static std::shared_ptr<prometheus::Registry> GetRegistry();

@@ -173,6 +176,7 @@ class Metrics {
bool metrics_enabled_;
bool gpu_metrics_enabled_;
std::mutex gpu_metrics_enabling_;
uint64_t metrics_interval_ms_;
};

}} // namespace nvidia::inferenceserver
4 changes: 2 additions & 2 deletions src/core/model_config_utils.cc
@@ -1254,12 +1254,12 @@ ValidateModelConfig(
" has kind KIND_MODEL but specifies one or more GPUs");
}
} else if (group.kind() == inference::ModelInstanceGroup::KIND_GPU) {
#if ! defined (TRITON_ENABLE_GPU) && ! defined (TRITON_ENABLE_MALI_GPU)
#if !defined(TRITON_ENABLE_GPU) && !defined(TRITON_ENABLE_MALI_GPU)
return Status(
Status::Code::INVALID_ARG,
"instance group " + group.name() + " of model " + config.name() +
" has kind KIND_GPU but server does not support GPUs");
#elif defined (TRITON_ENABLE_GPU)
#elif defined(TRITON_ENABLE_GPU)
if (group.gpus().size() == 0) {
if (supported_gpus.size() == 0) {
return Status(
2 changes: 1 addition & 1 deletion src/core/model_config_utils.h
@@ -1,4 +1,4 @@
// Copyright (c) 2018-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2018-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
24 changes: 22 additions & 2 deletions src/core/tritonserver.cc
@@ -271,6 +271,9 @@ class TritonServerOptions {
bool GpuMetrics() const { return gpu_metrics_; }
void SetGpuMetrics(bool b) { gpu_metrics_ = b; }

uint64_t MetricsInterval() const { return metrics_interval_; }
void SetMetricsInterval(uint64_t m) { metrics_interval_ = m; }

const std::string& BackendDir() const { return backend_dir_; }
void SetBackendDir(const std::string& bd)
{
@@ -319,6 +322,7 @@ class TritonServerOptions {
ni::RateLimiter::ResourceMap rate_limit_resource_map_;
bool metrics_;
bool gpu_metrics_;
uint64_t metrics_interval_;
unsigned int exit_timeout_;
uint64_t pinned_memory_pool_size_;
unsigned int buffer_manager_thread_count_;
@@ -338,8 +342,8 @@ TritonServerOptions::TritonServerOptions()
model_control_mode_(ni::ModelControlMode::MODE_POLL),
exit_on_error_(true), strict_model_config_(true), strict_readiness_(true),
rate_limit_mode_(ni::RateLimitMode::RL_OFF), metrics_(true),
gpu_metrics_(true), exit_timeout_(30), pinned_memory_pool_size_(1 << 28),
buffer_manager_thread_count_(0),
gpu_metrics_(true), metrics_interval_(2000), exit_timeout_(30),
pinned_memory_pool_size_(1 << 28), buffer_manager_thread_count_(0),
#ifdef TRITON_ENABLE_GPU
min_compute_capability_(TRITON_MIN_COMPUTE_CAPABILITY),
#else
@@ -1217,6 +1221,21 @@ TRITONSERVER_ServerOptionsSetGpuMetrics(
#endif // TRITON_ENABLE_METRICS
}

TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMetricsInterval(
TRITONSERVER_ServerOptions* options, uint64_t metrics_interval_ms)
{
#ifdef TRITON_ENABLE_METRICS
TritonServerOptions* loptions =
reinterpret_cast<TritonServerOptions*>(options);
loptions->SetMetricsInterval(metrics_interval_ms);
return nullptr; // Success
#else
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED, "metrics not supported");
#endif // TRITON_ENABLE_METRICS
}

TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBackendDirectory(
TRITONSERVER_ServerOptions* options, const char* backend_dir)
@@ -1705,6 +1724,7 @@ TRITONSERVER_ServerNew(
#ifdef TRITON_ENABLE_METRICS
if (loptions->Metrics()) {
ni::Metrics::EnableMetrics();
ni::Metrics::SetMetricsInterval(loptions->MetricsInterval());
}
#ifdef TRITON_ENABLE_METRICS_GPU
if (loptions->Metrics() && loptions->GpuMetrics()) {
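Embedders that link against the Triton C API set the interval on the options object before creating the server. A minimal sketch, assuming the standard options lifecycle calls (error handling elided; the header path may vary by install):

#include "tritonserver.h"

// Sketch: request a 500 ms metrics interval when embedding Triton.
void ConfigureMetricsInterval() {
  TRITONSERVER_ServerOptions* options = nullptr;
  TRITONSERVER_ServerOptionsNew(&options);
  TRITONSERVER_ServerOptionsSetMetrics(options, true);         // metrics on (the default)
  TRITONSERVER_ServerOptionsSetMetricsInterval(options, 500);  // milliseconds
  // ... pass options to TRITONSERVER_ServerNew(), then ...
  TRITONSERVER_ServerOptionsDelete(options);
}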
2 changes: 0 additions & 2 deletions src/servers/http_server.cc
@@ -150,7 +150,6 @@ HTTPMetricsServer::Handle(evhtp_request_t* req)
evhtp_send_reply(req, res);
}

#ifdef TRITON_ENABLE_METRICS
TRITONSERVER_Error*
HTTPMetricsServer::Create(
const std::shared_ptr<TRITONSERVER_Server>& server, const int32_t port,
@@ -163,7 +162,6 @@ HTTPMetricsServer::Create(

return nullptr;
}
#endif // TRITON_ENABLE_METRICS

#endif // TRITON_ENABLE_METRICS

14 changes: 14 additions & 0 deletions src/servers/main.cc
@@ -104,6 +104,7 @@ nvidia::inferenceserver::KeepAliveOptions grpc_keepalive_options_;
std::unique_ptr<nvidia::inferenceserver::HTTPServer> metrics_service_;
bool allow_metrics_ = true;
int32_t metrics_port_ = 8002;
float metrics_interval_ms_ = 2000;
#endif // TRITON_ENABLE_METRICS

#ifdef TRITON_ENABLE_TRACING
@@ -236,6 +237,7 @@ enum OptionId {
OPTION_ALLOW_METRICS,
OPTION_ALLOW_GPU_METRICS,
OPTION_METRICS_PORT,
OPTION_METRICS_INTERVAL_MS,
#endif // TRITON_ENABLE_METRICS
#ifdef TRITON_ENABLE_TRACING
OPTION_TRACE_FILEPATH,
@@ -410,6 +412,9 @@ std::vector<Option> options_
"--allow-metrics is true."},
{OPTION_METRICS_PORT, "metrics-port", Option::ArgInt,
"The port reporting prometheus metrics."},
{OPTION_METRICS_INTERVAL_MS, "metrics-interval-ms", Option::ArgFloat,
"Metrics will be collected once every <metrics-interval-ms> "
"milliseconds. Default is 2000 milliseconds."},
#endif // TRITON_ENABLE_METRICS
#ifdef TRITON_ENABLE_TRACING
{OPTION_TRACE_FILEPATH, "trace-file", Option::ArgStr,
@@ -1162,6 +1167,7 @@ Parse(TRITONSERVER_ServerOptions** server_options, int argc, char** argv)
#ifdef TRITON_ENABLE_METRICS
int32_t metrics_port = metrics_port_;
bool allow_gpu_metrics = true;
float metrics_interval_ms = metrics_interval_ms_;
#endif // TRITON_ENABLE_METRICS

#ifdef TRITON_ENABLE_TRACING
@@ -1340,6 +1346,9 @@ Parse(TRITONSERVER_ServerOptions** server_options, int argc, char** argv)
case OPTION_METRICS_PORT:
metrics_port = ParseIntOption(optarg);
break;
case OPTION_METRICS_INTERVAL_MS:
metrics_interval_ms = ParseIntOption(optarg);
break;
#endif // TRITON_ENABLE_METRICS

#ifdef TRITON_ENABLE_TRACING
@@ -1484,6 +1493,7 @@ Parse(TRITONSERVER_ServerOptions** server_options, int argc, char** argv)
#ifdef TRITON_ENABLE_METRICS
metrics_port_ = metrics_port;
allow_gpu_metrics = allow_metrics_ ? allow_gpu_metrics : false;
metrics_interval_ms_ = metrics_interval_ms;
#endif // TRITON_ENABLE_METRICS

#ifdef TRITON_ENABLE_TRACING
@@ -1582,6 +1592,10 @@ Parse(TRITONSERVER_ServerOptions** server_options, int argc, char** argv)
FAIL_IF_ERR(
TRITONSERVER_ServerOptionsSetGpuMetrics(loptions, allow_gpu_metrics),
"setting GPU metrics enable");
FAIL_IF_ERR(
TRITONSERVER_ServerOptionsSetMetricsInterval(
loptions, metrics_interval_ms_),
"setting metrics interval");
#endif // TRITON_ENABLE_METRICS

FAIL_IF_ERR(
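From the command line, the same knob is exposed as the --metrics-interval-ms option parsed above. A usage sketch, assuming a model repository at /models and the default metrics port:

# Collect metrics every 500 ms instead of the 2000 ms default.
tritonserver --model-repository=/models --metrics-interval-ms=500 &
sleep 1
curl -s localhost:8002/metrics | head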
2 changes: 1 addition & 1 deletion src/servers/signal.cc
@@ -1,4 +1,4 @@
// Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
