Merge pull request google#1973 from dashpole/cherrypick_to_v0.29
Cherrypick google#1971, google#1969, google#1964, google#1963 to release v0.29
dashpole authored Jun 21, 2018
2 parents 188659f + 58b59e5 commit d9e88b7
Showing 6 changed files with 63 additions and 44 deletions.
49 changes: 22 additions & 27 deletions accelerators/nvidia.go
@@ -31,7 +31,10 @@ import (
 )
 
 type NvidiaManager struct {
-	sync.RWMutex
+	sync.Mutex
+
+	// true if there are NVIDIA devices present on the node
+	devicesPresent bool
 
 	// true if the NVML library (libnvidia-ml.so.1) was loaded successfully
 	nvmlInitialized bool
@@ -51,20 +54,9 @@ func (nm *NvidiaManager) Setup() {
 		return
 	}
 
-	nm.initializeNVML()
-	if nm.nvmlInitialized {
-		return
-	}
-	go func() {
-		glog.V(2).Info("Starting goroutine to initialize NVML")
-		// TODO: use globalHousekeepingInterval
-		for range time.Tick(time.Minute) {
-			nm.initializeNVML()
-			if nm.nvmlInitialized {
-				return
-			}
-		}
-	}()
+	nm.devicesPresent = true
+
+	initializeNVML(nm)
 }
 
 // detectDevices returns true if a device with given pci id is present on the node.
@@ -91,20 +83,18 @@ func detectDevices(vendorId string) bool {
 }
 
 // initializeNVML initializes the NVML library and sets up the nvmlDevices map.
-func (nm *NvidiaManager) initializeNVML() {
+// This is defined as a variable to help in testing.
+var initializeNVML = func(nm *NvidiaManager) {
 	if err := gonvml.Initialize(); err != nil {
 		// This is under a logging level because otherwise we may cause
 		// log spam if the drivers/nvml is not installed on the system.
 		glog.V(4).Infof("Could not initialize NVML: %v", err)
 		return
 	}
+	nm.nvmlInitialized = true
 	numDevices, err := gonvml.DeviceCount()
 	if err != nil {
 		glog.Warningf("GPU metrics would not be available. Failed to get the number of nvidia devices: %v", err)
-		nm.Lock()
-		// Even though we won't have GPU metrics, the library was initialized and should be shutdown when exiting.
-		nm.nvmlInitialized = true
-		nm.Unlock()
 		return
 	}
 	glog.V(1).Infof("NVML initialized. Number of nvidia devices: %v", numDevices)
@@ -122,10 +112,6 @@ func (nm *NvidiaManager) initializeNVML() {
 		}
 		nm.nvidiaDevices[int(minorNumber)] = device
 	}
-	nm.Lock()
-	// Doing this at the end to avoid race in accessing nvidiaDevices in GetCollector.
-	nm.nvmlInitialized = true
-	nm.Unlock()
 }
 
 // Destroy shuts down NVML.
@@ -139,12 +125,21 @@ func (nm *NvidiaManager) Destroy() {
 // present in the devices.list file in the given devicesCgroupPath.
 func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
 	nc := &NvidiaCollector{}
-	nm.RLock()
+
+	if !nm.devicesPresent {
+		return nc, nil
+	}
+	// Makes sure that we don't call initializeNVML() concurrently and
+	// that we only call initializeNVML() when it's not initialized.
+	nm.Lock()
+	if !nm.nvmlInitialized {
+		initializeNVML(nm)
+	}
 	if !nm.nvmlInitialized || len(nm.nvidiaDevices) == 0 {
-		nm.RUnlock()
+		nm.Unlock()
 		return nc, nil
 	}
-	nm.RUnlock()
+	nm.Unlock()
 	nvidiaMinorNumbers, err := parseDevicesCgroup(devicesCgroupPath)
 	if err != nil {
 		return nc, err
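The accelerators/nvidia.go change above drops the background polling goroutine and instead initializes NVML lazily, under a mutex, the first time a collector is requested on a node with devices present. A minimal, self-contained sketch of that lazy-initialization pattern (the lazyManager type, initLibrary variable, and field names are illustrative placeholders, not the cadvisor API):

package main

import (
	"fmt"
	"sync"
)

// lazyManager defers expensive library setup until a collector is first
// requested, mirroring the mutex-guarded pattern in the diff above.
type lazyManager struct {
	sync.Mutex
	devicesPresent bool
	initialized    bool
}

// initLibrary stands in for NVML initialization; it is a package-level
// variable so tests could swap in a stub.
var initLibrary = func(m *lazyManager) {
	// Real code would call into the GPU library here.
	m.initialized = true
}

// GetCollector initializes the library at most once, only when devices
// exist, and serializes concurrent callers with the mutex.
func (m *lazyManager) GetCollector() string {
	if !m.devicesPresent {
		return "empty collector"
	}
	m.Lock()
	if !m.initialized {
		initLibrary(m)
	}
	m.Unlock()
	return "gpu collector"
}

func main() {
	m := &lazyManager{devicesPresent: true}
	fmt.Println(m.GetCollector()) // prints "gpu collector"
}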
14 changes: 13 additions & 1 deletion accelerators/nvidia_test.go
@@ -71,20 +71,32 @@ func TestGetCollector(t *testing.T) {
 		return []int{2, 3}, nil
 	}
 	parseDevicesCgroup = mockParser
+	originalInitializeNVML := initializeNVML
+	initializeNVML = func(_ *NvidiaManager) {}
 	defer func() {
 		parseDevicesCgroup = originalParser
+		initializeNVML = originalInitializeNVML
 	}()
 
 	nm := &NvidiaManager{}
 
-	// When nvmlInitialized is false, empty collector should be returned.
+	// When devicesPresent is false, empty collector should be returned.
 	ac, err := nm.GetCollector("does-not-matter")
 	assert.Nil(t, err)
 	assert.NotNil(t, ac)
 	nc, ok := ac.(*NvidiaCollector)
 	assert.True(t, ok)
 	assert.Equal(t, 0, len(nc.Devices))
 
+	// When nvmlInitialized is false, empty collector should be returned.
+	nm.devicesPresent = true
+	ac, err = nm.GetCollector("does-not-matter")
+	assert.Nil(t, err)
+	assert.NotNil(t, ac)
+	nc, ok = ac.(*NvidiaCollector)
+	assert.True(t, ok)
+	assert.Equal(t, 0, len(nc.Devices))
+
 	// When nvidiaDevices is empty, empty collector should be returned.
 	nm.nvmlInitialized = true
 	ac, err = nm.GetCollector("does-not-matter")
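The updated test works because initializeNVML is now a package-level function variable, which a test can overwrite with a no-op and then restore with defer. A short sketch of that stub-and-restore idiom under assumed names (doSetup and server are hypothetical, not from this repository):

package demo

import "testing"

// doSetup is a package-level function variable so tests can replace it,
// the same technique the diff applies to initializeNVML.
var doSetup = func(s *server) { s.ready = true /* expensive real work */ }

type server struct{ ready bool }

func TestWithoutRealSetup(t *testing.T) {
	// Save the real implementation, install a no-op, restore on exit.
	original := doSetup
	doSetup = func(_ *server) {}
	defer func() { doSetup = original }()

	s := &server{}
	doSetup(s) // calls the stub, so no expensive work runs
	if s.ready {
		t.Fatal("stub should not have marked the server ready")
	}
}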
12 changes: 4 additions & 8 deletions container/crio/handler.go
@@ -81,9 +81,6 @@ type crioContainerHandler struct {
 	ipAddress string
 
 	ignoreMetrics container.MetricSet
-
-	// container restart count
-	restartCount int
 }
 
 var _ container.ContainerHandler = &crioContainerHandler{}
@@ -175,7 +172,10 @@ func newCrioContainerHandler(
 	// ignore err and get zero as default, this happens with sandboxes, not sure why...
 	// kube isn't sending restart count in labels for sandboxes.
 	restartCount, _ := strconv.Atoi(cInfo.Annotations["io.kubernetes.container.restartCount"])
-	handler.restartCount = restartCount
+	// Only adds restartcount label if it's greater than 0
+	if restartCount > 0 {
+		handler.labels["restartcount"] = strconv.Itoa(restartCount)
+	}
 
 	handler.ipAddress = cInfo.IP
@@ -225,10 +225,6 @@ func (self *crioContainerHandler) GetSpec() (info.ContainerSpec, error) {
 	spec, err := common.GetSpec(self.cgroupPaths, self.machineInfoFactory, self.needNet(), hasFilesystem)
 
 	spec.Labels = self.labels
-	// Only adds restartcount label if it's greater than 0
-	if self.restartCount > 0 {
-		spec.Labels["restartcount"] = strconv.Itoa(self.restartCount)
-	}
 	spec.Envs = self.envs
 	spec.Image = self.image
 
12 changes: 4 additions & 8 deletions container/docker/handler.go
@@ -115,9 +115,6 @@ type dockerContainerHandler struct {
 
 	// zfs watcher
 	zfsWatcher *zfs.ZfsWatcher
-
-	// container restart count
-	restartCount int
 }
 
 var _ container.ContainerHandler = &dockerContainerHandler{}
@@ -250,7 +247,10 @@ func newDockerContainerHandler(
 	handler.image = ctnr.Config.Image
 	handler.networkMode = ctnr.HostConfig.NetworkMode
 	handler.deviceID = ctnr.GraphDriver.Data["DeviceId"]
-	handler.restartCount = ctnr.RestartCount
+	// Only adds restartcount label if it's greater than 0
+	if ctnr.RestartCount > 0 {
+		handler.labels["restartcount"] = strconv.Itoa(ctnr.RestartCount)
+	}
 
 	// Obtain the IP address for the contianer.
 	// If the NetworkMode starts with 'container:' then we need to use the IP address of the container specified.
@@ -386,10 +386,6 @@ func (self *dockerContainerHandler) GetSpec() (info.ContainerSpec, error) {
 	spec, err := common.GetSpec(self.cgroupPaths, self.machineInfoFactory, self.needNet(), hasFilesystem)
 
 	spec.Labels = self.labels
-	// Only adds restartcount label if it's greater than 0
-	if self.restartCount > 0 {
-		spec.Labels["restartcount"] = strconv.Itoa(self.restartCount)
-	}
 	spec.Envs = self.envs
 	spec.Image = self.image
 	spec.CreationTime = self.creationTime
15 changes: 15 additions & 0 deletions fs/fs.go
@@ -533,6 +533,21 @@ func (self *RealFsInfo) GetDirFsDevice(dir string) (*DeviceInfo, error) {
 	}
 
 	mount, found := self.mounts[dir]
+	// try the parent dir if not found until we reach the root dir
+	// this is an issue on btrfs systems where the directory is not
+	// the subvolume
+	for !found {
+		pathdir, _ := filepath.Split(dir)
+		// break when we reach root
+		if pathdir == "/" {
+			break
+		}
+		// trim "/" from the new parent path otherwise the next possible
+		// filepath.Split in the loop will not split the string any further
+		dir = strings.TrimSuffix(pathdir, "/")
+		mount, found = self.mounts[dir]
+	}
+
 	if found && mount.Fstype == "btrfs" && mount.Major == 0 && strings.HasPrefix(mount.Source, "/dev/") {
 		major, minor, err := getBtrfsMajorMinorIds(mount)
 		if err != nil {
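The fs/fs.go hunk makes GetDirFsDevice climb from the requested directory toward the root until it hits a known mount point, which is what resolves paths that live inside a btrfs subvolume. A small standalone sketch of the same walk over a hypothetical mount table (findMount and the example paths are illustrative, not cadvisor code):

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// findMount climbs toward "/" until a path present in mounts is found,
// mirroring the loop added to GetDirFsDevice.
func findMount(mounts map[string]string, dir string) (string, bool) {
	fs, found := mounts[dir]
	for !found {
		parent, _ := filepath.Split(dir)
		if parent == "/" {
			break
		}
		// Drop the trailing "/" so the next filepath.Split keeps making progress.
		dir = strings.TrimSuffix(parent, "/")
		fs, found = mounts[dir]
	}
	return fs, found
}

func main() {
	mounts := map[string]string{"/var/lib": "btrfs"}
	fs, ok := findMount(mounts, "/var/lib/docker/btrfs/subvolumes/abc")
	fmt.Println(fs, ok) // prints "btrfs true"
}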
5 changes: 5 additions & 0 deletions manager/watcher/raw/raw.go
@@ -110,6 +110,11 @@ func (self *rawContainerWatcher) Stop() error {
 // Watches the specified directory and all subdirectories. Returns whether the path was
 // already being watched and an error (if any).
 func (self *rawContainerWatcher) watchDirectory(dir string, containerName string) (bool, error) {
+	// Don't watch .mount cgroups because they never have containers as sub-cgroups. A single container
+	// can have many .mount cgroups associated with it which can quickly exhaust the inotify watches on a node.
+	if strings.HasSuffix(containerName, ".mount") {
+		return false, nil
+	}
 	alreadyWatching, err := self.watcher.AddWatch(containerName, dir)
 	if err != nil {
 		return alreadyWatching, err
