Skip to content

Commit

Permalink
feat: add GPU clock monitoring to gpu_monitor (tier4#687)
Browse files Browse the repository at this point in the history
Signed-off-by: v-nakayama7440-esol <v-nakayama7440@esol.co.jp>

Signed-off-by: v-nakayama7440-esol <v-nakayama7440@esol.co.jp>
  • Loading branch information
v-nakayama7440-esol authored and boyali committed Oct 3, 2022
1 parent a00acab commit 8b69bd5
Show file tree
Hide file tree
Showing 5 changed files with 156 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,12 @@
contains: [": GPU Thermal Throttling"]
timeout: 3.0

frequency:
type: diagnostic_aggregator/GenericAnalyzer
path: frequency
contains: [": GPU Frequency"]
timeout: 3.0

memory:
type: diagnostic_aggregator/AnalyzerGroup
path: memory
Expand Down
2 changes: 1 addition & 1 deletion system/system_monitor/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ Every topic is published in 1 minute interval.
| | GPU Usage ||| - | |
| | GPU Memory Usage || - | - | |
| | GPU Thermal Throttling || - | - | |
| | GPU Frequency | - || - | |
| | GPU Frequency | || - | On Intel platforms, monitors whether the current GPU clock is one of the clocks supported by the GPU. |

## ROS parameters

Expand Down
21 changes: 19 additions & 2 deletions system/system_monitor/docs/topics_gpu_monitor.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,27 @@

## <u>GPU Frequency</u>

> Tegra platform only.
/diagnostics/gpu_monitor: GPU Frequency

### Intel platform

<b>[summary]</b>

| level | message |
| ----- | ----------------- |
| OK | OK |
| WARN | unsupported clock |

<b>[values]</b>

| key | value (example) |
| ------------------------- | ---------------------- |
| GPU [0-9]: status | OK / unsupported clock |
| GPU [0-9]: name | GeForce GTX 1650 |
| GPU [0-9]: graphics clock | 1020 MHz |

### Tegra platform

<b>[summary]</b>

| level | message |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <nvml.h>

#include <map>
#include <set>
#include <string>
#include <vector>

Expand All @@ -45,10 +46,11 @@
*/
struct gpu_info
{
nvmlDevice_t device; //!< @brief handle for a particular device
char name[NVML_DEVICE_NAME_BUFFER_SIZE]; //!< @brief name of device
nvmlPciInfo_t pci; //!< @brief PCI information about a GPU device
nvmlUtilization_t utilization; //!< @brief Utilization information for a device
nvmlDevice_t device; //!< @brief handle for a particular device
char name[NVML_DEVICE_NAME_BUFFER_SIZE]; //!< @brief name of device
nvmlPciInfo_t pci; //!< @brief PCI information about a GPU device
nvmlUtilization_t utilization; //!< @brief Utilization information for a device
std::set<unsigned int> supported_gpu_clocks; //!< @brief set of supported graphics clocks, filled by getSupportedGPUClocks() and looked up by checkFrequency()
};

class GPUMonitor : public GPUMonitorBase
Expand Down Expand Up @@ -123,11 +125,36 @@ class GPUMonitor : public GPUMonitorBase
*/
std::string toHumanReadable(unsigned long long size); // NOLINT(runtime/int)

/**
* @brief check GPU frequency: report the current graphics clock of every GPU and flag
* a clock that is not in that GPU's set of supported clocks as "unsupported clock"
* @param [out] stat diagnostic message passed directly to diagnostic publish calls
* @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference
* to pass diagnostic message updated in this function to diagnostic publish calls.
*/
void checkFrequency(
diagnostic_updater::DiagnosticStatusWrapper & stat) override; // NOLINT(runtime/references)

/**
* @brief get supported GPU clocks
* @param [in] index GPU index (used in error log messages)
* @param [in] device GPU device
* @param [out] supported_gpu_clocks set that receives the supported graphics clocks
* @return true if the supported clocks were retrieved, false if any NVML query failed
*/
bool getSupportedGPUClocks(
int index, nvmlDevice_t & device, std::set<unsigned int> & supported_gpu_clocks);

static const size_t MAX_ARRAY_SIZE = 64;
static const size_t MAX_NAME_LENGTH = 128;

std::vector<gpu_info> gpus_; //!< @brief list of gpus
uint64_t current_timestamp_ = 0; //!< @brief latest timestamp[usec] of addProcessUsage()

/**
* @brief GPU frequency status messages
*/
const std::map<int, const char *> frequency_dict_ = {
{DiagStatus::OK, "OK"}, {DiagStatus::WARN, "unsupported clock"}};
};

#endif // SYSTEM_MONITOR__GPU_MONITOR__NVML_GPU_MONITOR_HPP_
102 changes: 99 additions & 3 deletions system/system_monitor/src/gpu_monitor/nvml_gpu_monitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,6 @@

GPUMonitor::GPUMonitor(const rclcpp::NodeOptions & options) : GPUMonitorBase("gpu_monitor", options)
{
// Include frequency into GPU Thermal Throttling thus remove.
updater_.removeByName("GPU Frequency");

nvmlReturn_t ret = nvmlInit();
if (ret != NVML_SUCCESS) {
RCLCPP_ERROR(this->get_logger(), "Failed to initialize NVML: %s\n", nvmlErrorString(ret));
Expand Down Expand Up @@ -73,6 +70,9 @@ GPUMonitor::GPUMonitor(const rclcpp::NodeOptions & options) : GPUMonitorBase("gp
nvmlErrorString(ret));
continue;
}
if (!getSupportedGPUClocks(index, info.device, info.supported_gpu_clocks)) {
continue;
}
gpus_.push_back(info);
}
}
Expand Down Expand Up @@ -397,5 +397,101 @@ std::string GPUMonitor::toHumanReadable(unsigned long long size) // NOLINT
return fmt::format(format, dsize, units[count]);
}

/**
 * @brief Publish the current graphics clock of every known GPU and warn when a clock is
 *        not contained in that GPU's set of supported clocks.
 * @param [out] stat diagnostic status to fill: per-GPU key/value pairs plus an overall summary
 * @note The summary level is the worst level found across all GPUs. If the clock of any
 *       GPU cannot be read, the status is set to ERROR and the function returns at once.
 */
void GPUMonitor::checkFrequency(diagnostic_updater::DiagnosticStatusWrapper & stat)
{
  // Remember start time to measure elapsed time
  const auto t_start = SystemMonitorUtility::startMeasurement();

  if (gpus_.empty()) {
    stat.summary(DiagStatus::ERROR, "gpu not found");
    return;
  }

  int whole_level = DiagStatus::OK;
  int index = 0;

  for (const auto & gpu : gpus_) {
    unsigned int clock = 0;
    const nvmlReturn_t ret = nvmlDeviceGetClockInfo(gpu.device, NVML_CLOCK_GRAPHICS, &clock);
    if (ret != NVML_SUCCESS) {
      stat.summary(DiagStatus::ERROR, "Failed to retrieve the current clock speeds");
      stat.add(fmt::format("GPU {}: name", index), gpu.name);
      stat.add(fmt::format("GPU {}: bus-id", index), gpu.pci.busId);
      stat.add(fmt::format("GPU {}: content", index), nvmlErrorString(ret));
      return;
    }

    // A clock that is not in the set collected at start-up is reported as a warning.
    int level = DiagStatus::OK;
    if (gpu.supported_gpu_clocks.count(clock) == 0) {
      level = DiagStatus::WARN;
    }

    stat.add(fmt::format("GPU {}: status", index), frequency_dict_.at(level));
    stat.add(fmt::format("GPU {}: name", index), gpu.name);
    // clock is unsigned int, so the matching printf conversion is %u (was %d).
    stat.addf(fmt::format("GPU {}: graphics clock", index), "%u MHz", clock);

    whole_level = std::max(whole_level, level);
    ++index;
  }

  stat.summary(whole_level, frequency_dict_.at(whole_level));

  // Measure elapsed time since start time and report
  SystemMonitorUtility::stopMeasurement(t_start, stat);
}

/**
 * @brief Collect every graphics clock the device reports as supported, across all of its
 *        supported memory clocks.
 * @param [in] index GPU index, used only to tag error log messages
 * @param [in] device NVML device handle to query
 * @param [out] supported_gpu_clocks set into which the supported graphics clocks are inserted
 * @return true if every NVML query succeeded, false (after logging an error) otherwise
 * @note Each NVML list is queried twice: first with a null buffer to learn the element
 *       count (the call is expected to answer NVML_ERROR_INSUFFICIENT_SIZE), then with a
 *       buffer of that size to fetch the values.
 * @note On failure, supported_gpu_clocks may already hold clocks from earlier memory clocks.
 */
bool GPUMonitor::getSupportedGPUClocks(
  int index, nvmlDevice_t & device, std::set<unsigned int> & supported_gpu_clocks)
{
  unsigned int mem_clock_count = 0;

  // First pass: ask only for the number of supported memory clocks.
  nvmlReturn_t ret = nvmlDeviceGetSupportedMemoryClocks(device, &mem_clock_count, nullptr);
  if (ret != NVML_ERROR_INSUFFICIENT_SIZE) {
    RCLCPP_ERROR(
      this->get_logger(), "Failed to retrieve the count of possible memory clocks [%d]: %s", index,
      nvmlErrorString(ret));
    return false;
  }

  // Second pass: fetch the memory clocks into a buffer of the reported size.
  std::vector<unsigned int> mem_clocks(mem_clock_count);
  ret = nvmlDeviceGetSupportedMemoryClocks(device, &mem_clock_count, mem_clocks.data());
  if (ret != NVML_SUCCESS) {
    RCLCPP_ERROR(
      this->get_logger(), "Failed to retrieve the list of possible memory clocks [%d]: %s", index,
      nvmlErrorString(ret));
    return false;
  }
  // Keep only the entries NVML reported as written.
  mem_clocks.resize(mem_clock_count);

  for (const unsigned int mem_clock : mem_clocks) {
    unsigned int gpu_clock_count = 0;

    // First pass: number of graphics clocks usable with this memory clock.
    ret = nvmlDeviceGetSupportedGraphicsClocks(device, mem_clock, &gpu_clock_count, nullptr);
    if (ret != NVML_ERROR_INSUFFICIENT_SIZE) {
      RCLCPP_ERROR(
        this->get_logger(),
        "Failed to retrieve the count of possible graphics clocks for %u MHz memory clock [%d]: %s",
        mem_clock, index, nvmlErrorString(ret));
      return false;
    }

    // Second pass: fetch the graphics clocks themselves.
    std::vector<unsigned int> gpu_clocks(gpu_clock_count);
    ret = nvmlDeviceGetSupportedGraphicsClocks(
      device, mem_clock, &gpu_clock_count, gpu_clocks.data());
    if (ret != NVML_SUCCESS) {
      RCLCPP_ERROR(
        this->get_logger(),
        "Failed to retrieve the list of possible graphics clocks for %u MHz memory clock [%d]: %s",
        mem_clock, index, nvmlErrorString(ret));
      return false;
    }

    // Insert only the entries NVML reported as written; the set removes duplicates
    // shared between different memory clocks.
    supported_gpu_clocks.insert(gpu_clocks.begin(), gpu_clocks.begin() + gpu_clock_count);
  }
  return true;
}

#include <rclcpp_components/register_node_macro.hpp>
RCLCPP_COMPONENTS_REGISTER_NODE(GPUMonitor)

0 comments on commit 8b69bd5

Please sign in to comment.