Skip to content

Commit

Permalink
Fix device plugin failing to launch on CUDA 460
Browse files Browse the repository at this point in the history
Signed-off-by: limengxuan <limengxuan@4paradigm.com>
  • Loading branch information
archlitchi committed Jan 30, 2024
1 parent 699a18b commit b467e73
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 8 deletions.
18 changes: 14 additions & 4 deletions cmd/vGPUmonitor/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,18 +164,28 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
if nvret != nvml.SUCCESS {
klog.Error(nvml.ErrorString(nvret))
}
memory, nvret := hdev.GetMemoryInfo_v2()
if nvret != nvml.SUCCESS {
klog.Error(nvml.ErrorString(nvret))
memoryUsed := 0
memory, ret := hdev.GetMemoryInfo_v2()
if ret == nvml.SUCCESS {
memoryUsed = int(memory.Used)
} else {
klog.Error("nvml get memory_v2 error ret=", ret)
memory_v1, ret := hdev.GetMemoryInfo()
if ret != nvml.SUCCESS {
klog.Error("nvml get memory error ret=", ret)
} else {
memoryUsed = int(memory_v1.Used)
}
}

uuid, nvret := hdev.GetUUID()
if nvret != nvml.SUCCESS {
klog.Error(nvml.ErrorString(nvret))
} else {
ch <- prometheus.MustNewConstMetric(
hostGPUdesc,
prometheus.GaugeValue,
float64(memory.Used),
float64(memoryUsed),
fmt.Sprint(ii), uuid,
)
}
Expand Down
16 changes: 12 additions & 4 deletions pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,18 @@ func (r *NvidiaDevicePlugin) getApiDevices() *[]*api.DeviceInfo {
klog.Errorln("nvml new device by index error idx=", idx, "err=", ret)
panic(0)
}
memoryTotal := 0
memory, ret := ndev.GetMemoryInfo_v2()
if ret != nvml.SUCCESS {
klog.Error("nvml get memory error ret=", ret)
panic(0)
if ret == nvml.SUCCESS {
memoryTotal = int(memory.Total)
} else {
klog.Error("nvml get memory_v2 error ret=", ret)
memory_v1, ret := ndev.GetMemoryInfo()
if ret != nvml.SUCCESS {
klog.Error("nvml get memory_v2 error ret=", ret)
panic(0)
}
memoryTotal = int(memory_v1.Total)
}
UUID, ret := ndev.GetUUID()
if ret != nvml.SUCCESS {
Expand All @@ -122,7 +130,7 @@ func (r *NvidiaDevicePlugin) getApiDevices() *[]*api.DeviceInfo {
panic(0)
}

registeredmem := int32(memory.Total / 1024 / 1024)
registeredmem := int32(memoryTotal / 1024 / 1024)
if *util.DeviceMemoryScaling != 1 {
registeredmem = int32(float64(registeredmem) * *util.DeviceMemoryScaling)
}
Expand Down

0 comments on commit b467e73

Please sign in to comment.