From b467e732786b303b555f7db5b0b1dc408c1076e3 Mon Sep 17 00:00:00 2001 From: limengxuan Date: Tue, 30 Jan 2024 18:45:45 +0800 Subject: [PATCH] fix device plugin failing to launch on CUDA 460 Signed-off-by: limengxuan --- cmd/vGPUmonitor/metrics.go | 18 ++++++++++++++---- .../nvidiadevice/nvinternal/plugin/register.go | 16 ++++++++++++---- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/cmd/vGPUmonitor/metrics.go b/cmd/vGPUmonitor/metrics.go index d2405397..9b607424 100644 --- a/cmd/vGPUmonitor/metrics.go +++ b/cmd/vGPUmonitor/metrics.go @@ -164,10 +164,20 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) { if nvret != nvml.SUCCESS { klog.Error(nvml.ErrorString(nvret)) } - memory, nvret := hdev.GetMemoryInfo_v2() - if nvret != nvml.SUCCESS { - klog.Error(nvml.ErrorString(nvret)) + memoryUsed := 0 + memory, ret := hdev.GetMemoryInfo_v2() + if ret == nvml.SUCCESS { + memoryUsed = int(memory.Used) + } else { + klog.Error("nvml get memory_v2 error ret=", ret) + memory_v1, ret := hdev.GetMemoryInfo() + if ret != nvml.SUCCESS { + klog.Error("nvml get memory error ret=", ret) + } else { + memoryUsed = int(memory_v1.Used) + } } + uuid, nvret := hdev.GetUUID() if nvret != nvml.SUCCESS { klog.Error(nvml.ErrorString(nvret)) @@ -175,7 +185,7 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) { ch <- prometheus.MustNewConstMetric( hostGPUdesc, prometheus.GaugeValue, - float64(memory.Used), + float64(memoryUsed), fmt.Sprint(ii), uuid, ) } diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go index 56614bc6..ed0e1182 100644 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go +++ b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go @@ -106,10 +106,18 @@ func (r *NvidiaDevicePlugin) getApiDevices() *[]*api.DeviceInfo { klog.Errorln("nvml new device by index error idx=", idx, "err=", ret) panic(0) } + memoryTotal := 0 
memory, ret := ndev.GetMemoryInfo_v2() - if ret != nvml.SUCCESS { - klog.Error("nvml get memory error ret=", ret) - panic(0) + if ret == nvml.SUCCESS { + memoryTotal = int(memory.Total) + } else { + klog.Error("nvml get memory_v2 error ret=", ret) + memory_v1, ret := ndev.GetMemoryInfo() + if ret != nvml.SUCCESS { + klog.Error("nvml get memory_v2 error ret=", ret) + panic(0) + } + memoryTotal = int(memory_v1.Total) } UUID, ret := ndev.GetUUID() if ret != nvml.SUCCESS { @@ -122,7 +130,7 @@ func (r *NvidiaDevicePlugin) getApiDevices() *[]*api.DeviceInfo { panic(0) } - registeredmem := int32(memory.Total / 1024 / 1024) + registeredmem := int32(memoryTotal / 1024 / 1024) if *util.DeviceMemoryScaling != 1 { registeredmem = int32(float64(registeredmem) * *util.DeviceMemoryScaling) }