Skip to content

Commit

Permalink
Initialize NVML on demand.
Browse files Browse the repository at this point in the history
Earlier if the NVIDIA driver was not installed when cAdvisor was started
we would start a goroutine to try to initialize NVML every minute.

This resulted in a race. We can have a situation where:
- goroutine tries to initialize NVML but fails. So, it sleeps for a minute.
- the driver is installed.
- a container that uses NVIDIA devices is started.
This container would not get GPU stats because a minute has not passed
since the last failed initialization attempt and so NVML is not
initialized.
  • Loading branch information
rohitagarwal003 committed Jun 18, 2018
1 parent f834c0f commit 2ce4161
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 28 deletions.
49 changes: 22 additions & 27 deletions accelerators/nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@ import (
)

type NvidiaManager struct {
sync.RWMutex
sync.Mutex

// true if there are NVIDIA devices present on the node
devicesPresent bool

// true if the NVML library (libnvidia-ml.so.1) was loaded successfully
nvmlInitialized bool
Expand All @@ -51,20 +54,9 @@ func (nm *NvidiaManager) Setup() {
return
}

nm.initializeNVML()
if nm.nvmlInitialized {
return
}
go func() {
glog.V(2).Info("Starting goroutine to initialize NVML")
// TODO: use globalHousekeepingInterval
for range time.Tick(time.Minute) {
nm.initializeNVML()
if nm.nvmlInitialized {
return
}
}
}()
nm.devicesPresent = true

initializeNVML(nm)
}

// detectDevices returns true if a device with given pci id is present on the node.
Expand All @@ -91,20 +83,18 @@ func detectDevices(vendorId string) bool {
}

// initializeNVML initializes the NVML library and sets up the nvmlDevices map.
func (nm *NvidiaManager) initializeNVML() {
// This is defined as a variable to help in testing.
var initializeNVML = func(nm *NvidiaManager) {
if err := gonvml.Initialize(); err != nil {
// This is under a logging level because otherwise we may cause
// log spam if the drivers/nvml is not installed on the system.
glog.V(4).Infof("Could not initialize NVML: %v", err)
return
}
nm.nvmlInitialized = true
numDevices, err := gonvml.DeviceCount()
if err != nil {
glog.Warningf("GPU metrics would not be available. Failed to get the number of nvidia devices: %v", err)
nm.Lock()
// Even though we won't have GPU metrics, the library was initialized and should be shutdown when exiting.
nm.nvmlInitialized = true
nm.Unlock()
return
}
glog.V(1).Infof("NVML initialized. Number of nvidia devices: %v", numDevices)
Expand All @@ -122,10 +112,6 @@ func (nm *NvidiaManager) initializeNVML() {
}
nm.nvidiaDevices[int(minorNumber)] = device
}
nm.Lock()
// Doing this at the end to avoid race in accessing nvidiaDevices in GetCollector.
nm.nvmlInitialized = true
nm.Unlock()
}

// Destroy shuts down NVML.
Expand All @@ -139,12 +125,21 @@ func (nm *NvidiaManager) Destroy() {
// present in the devices.list file in the given devicesCgroupPath.
func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
nc := &NvidiaCollector{}
nm.RLock()

if !nm.devicesPresent {
return nc, nil
}
// Makes sure that we don't call initializeNVML() concurrently and
// that we only call initializeNVML() when it's not initialized.
nm.Lock()
if !nm.nvmlInitialized {
initializeNVML(nm)
}
if !nm.nvmlInitialized || len(nm.nvidiaDevices) == 0 {
nm.RUnlock()
nm.Unlock()
return nc, nil
}
nm.RUnlock()
nm.Unlock()
nvidiaMinorNumbers, err := parseDevicesCgroup(devicesCgroupPath)
if err != nil {
return nc, err
Expand Down
14 changes: 13 additions & 1 deletion accelerators/nvidia_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,20 +71,32 @@ func TestGetCollector(t *testing.T) {
return []int{2, 3}, nil
}
parseDevicesCgroup = mockParser
originalInitializeNVML := initializeNVML
initializeNVML = func(_ *NvidiaManager) {}
defer func() {
parseDevicesCgroup = originalParser
initializeNVML = originalInitializeNVML
}()

nm := &NvidiaManager{}

// When nvmlInitialized is false, empty collector should be returned.
// When devicesPresent is false, empty collector should be returned.
ac, err := nm.GetCollector("does-not-matter")
assert.Nil(t, err)
assert.NotNil(t, ac)
nc, ok := ac.(*NvidiaCollector)
assert.True(t, ok)
assert.Equal(t, 0, len(nc.Devices))

// When nvmlInitialized is false, empty collector should be returned.
nm.devicesPresent = true
ac, err = nm.GetCollector("does-not-matter")
assert.Nil(t, err)
assert.NotNil(t, ac)
nc, ok = ac.(*NvidiaCollector)
assert.True(t, ok)
assert.Equal(t, 0, len(nc.Devices))

// When nvidiaDevices is empty, empty collector should be returned.
nm.nvmlInitialized = true
ac, err = nm.GetCollector("does-not-matter")
Expand Down

0 comments on commit 2ce4161

Please sign in to comment.