Skip to content

Commit

Permalink
Merge pull request #2574 from RenaudWasTaken/nvidia
Browse files Browse the repository at this point in the history
Return a NoopManager if metricset does not contain the accelerator value
  • Loading branch information
dashpole authored Jun 9, 2020
2 parents cdaec26 + 4bc592b commit 196b510
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 11 deletions.
24 changes: 15 additions & 9 deletions accelerators/nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"sync"
"time"

"github.com/google/cadvisor/container"
info "github.com/google/cadvisor/info/v1"
"github.com/google/cadvisor/stats"

Expand All @@ -48,18 +49,23 @@ var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"

const nvidiaVendorID = "0x10de"

func NewNvidiaManager() stats.Manager {
func NewNvidiaManager(includedMetrics container.MetricSet) stats.Manager {
if !includedMetrics.Has(container.AcceleratorUsageMetrics) {
klog.V(2).Info("NVIDIA GPU metrics disabled")
return &stats.NoopManager{}
}

manager := &nvidiaManager{}
err := manager.setup()
if err != nil {
klog.Warningf("NVidia GPU metrics will not be available: %s", err)
klog.Warningf("NVIDIA GPU metrics will not be available: %s", err)
manager.Destroy()
return &stats.NoopManager{}
}
return manager
}

// setup initializes NVML if nvidia devices are present on the node.
// setup initializes NVML if NVIDIA devices are present on the node.
func (nm *nvidiaManager) setup() error {
if !detectDevices(nvidiaVendorID) {
return fmt.Errorf("no NVIDIA devices found")
Expand Down Expand Up @@ -104,21 +110,21 @@ var initializeNVML = func(nm *nvidiaManager) error {
nm.nvmlInitialized = true
numDevices, err := gonvml.DeviceCount()
if err != nil {
return fmt.Errorf("GPU metrics would not be available. Failed to get the number of nvidia devices: %v", err)
return fmt.Errorf("GPU metrics would not be available. Failed to get the number of NVIDIA devices: %v", err)
}
if numDevices == 0 {
return nil
}
klog.V(1).Infof("NVML initialized. Number of nvidia devices: %v", numDevices)
klog.V(1).Infof("NVML initialized. Number of NVIDIA devices: %v", numDevices)
nm.nvidiaDevices = make(map[int]gonvml.Device, numDevices)
for i := 0; i < int(numDevices); i++ {
device, err := gonvml.DeviceHandleByIndex(uint(i))
if err != nil {
return fmt.Errorf("Failed to get nvidia device handle %d: %v", i, err)
return fmt.Errorf("Failed to get NVIDIA device handle %d: %v", i, err)
}
minorNumber, err := device.MinorNumber()
if err != nil {
return fmt.Errorf("Failed to get nvidia device minor number: %v", err)
return fmt.Errorf("Failed to get NVIDIA device minor number: %v", err)
}
nm.nvidiaDevices[int(minorNumber)] = device
}
Expand All @@ -135,7 +141,7 @@ func (nm *nvidiaManager) Destroy() {
}
}

// GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices
// GetCollector returns a collector that can fetch NVIDIA gpu metrics for NVIDIA devices
// present in the devices.list file in the given devicesCgroupPath.
func (nm *nvidiaManager) GetCollector(devicesCgroupPath string) (stats.Collector, error) {
nc := &nvidiaCollector{}
Expand Down Expand Up @@ -165,7 +171,7 @@ func (nm *nvidiaManager) GetCollector(devicesCgroupPath string) (stats.Collector
for _, minor := range nvidiaMinorNumbers {
device, ok := nm.nvidiaDevices[minor]
if !ok {
return &stats.NoopCollector{}, fmt.Errorf("nvidia device minor number %d not found in cached devices", minor)
return &stats.NoopCollector{}, fmt.Errorf("NVIDIA device minor number %d not found in cached devices", minor)
}
nc.devices = append(nc.devices, device)
}
Expand Down
3 changes: 2 additions & 1 deletion cmd/cadvisor.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ var (

// List of metrics that can be ignored.
ignoreWhitelist = container.MetricSet{
container.AcceleratorUsageMetrics: struct{}{},
container.DiskUsageMetrics: struct{}{},
container.DiskIOMetrics: struct{}{},
container.NetworkUsageMetrics: struct{}{},
Expand Down Expand Up @@ -136,7 +137,7 @@ func (ml *metricSetValue) Set(value string) error {
}

func init() {
flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'cpu_topology','disk', 'diskIO', 'network', 'tcp', 'udp', 'percpu', 'sched', 'process', 'hugetlb', 'referenced_memory'.")
flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'accelerator', 'cpu_topology','disk', 'diskIO', 'network', 'tcp', 'udp', 'percpu', 'sched', 'process', 'hugetlb', 'referenced_memory'.")

// Default logging verbosity to V(2)
flag.Set("v", "2")
Expand Down
2 changes: 1 addition & 1 deletion manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig
containerWatchers: []watcher.ContainerWatcher{},
eventsChannel: eventsChannel,
collectorHTTPClient: collectorHTTPClient,
nvidiaManager: accelerators.NewNvidiaManager(),
nvidiaManager: accelerators.NewNvidiaManager(includedMetricsSet),
rawContainerCgroupPathPrefixWhiteList: rawContainerCgroupPathPrefixWhiteList,
}

Expand Down

0 comments on commit 196b510

Please sign in to comment.