Skip to content

Commit

Permalink
Merge pull request google#2579 from iwankgb/skip_offline_cpus
Browse files Browse the repository at this point in the history
Fixed topology with offline CPUs on x86
  • Loading branch information
dashpole authored Jun 11, 2020
2 parents bb5fbc9 + 29a53c9 commit 4059641
Show file tree
Hide file tree
Showing 14 changed files with 261 additions and 4 deletions.
12 changes: 12 additions & 0 deletions machine/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
package machine

import (
"bytes"
"fmt"
"io/ioutil"
"os"
Expand Down Expand Up @@ -246,6 +247,17 @@ func getUniqueCPUPropertyCount(cpuBusPath string, propertyName string) int {
}
uniques := make(map[string]bool)
for _, sysCPUPath := range sysCPUPaths {
onlinePath := filepath.Join(sysCPUPath, "online")
onlineVal, err := ioutil.ReadFile(onlinePath)
if err != nil {
klog.Warningf("Cannot determine CPU %s online state, skipping", sysCPUPath)
continue
}
onlineVal = bytes.TrimSpace(onlineVal)
if len(onlineVal) == 0 || onlineVal[0] != 49 {
klog.Warningf("CPU %s is offline, skipping", sysCPUPath)
continue
}
propertyPath := filepath.Join(sysCPUPath, sysFsCPUTopology, propertyName)
propertyVal, err := ioutil.ReadFile(propertyPath)
if err != nil {
Expand Down
1 change: 1 addition & 0 deletions machine/testdata/cpu0/online
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1
Empty file added machine/testdata/cpu1/.gitkeep
Empty file.
1 change: 1 addition & 0 deletions machine/testdata/cpu2/online
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0
1 change: 1 addition & 0 deletions machine/testdata/cpu9999/online
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1
14 changes: 14 additions & 0 deletions utils/sysfs/fakesysfs/fake.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ type FakeSysFs struct {

hugePagesNr map[string]string
hugePagesNrErr error

onlineCPUs map[string]interface{}
}

func (fs *FakeSysFs) GetNodesPaths() ([]string, error) {
Expand Down Expand Up @@ -198,3 +200,15 @@ func (fs *FakeSysFs) SetEntryName(name string) {
func (fs *FakeSysFs) GetSystemUUID() (string, error) {
return "1F862619-BA9F-4526-8F85-ECEAF0C97430", nil
}

func (fs *FakeSysFs) IsCPUOnline(dir string) bool {
if fs.onlineCPUs == nil {
return true
}
_, ok := fs.onlineCPUs[dir]
return ok
}

func (fs *FakeSysFs) SetOnlineCPUs(online map[string]interface{}) {
fs.onlineCPUs = online
}
31 changes: 31 additions & 0 deletions utils/sysfs/sysfs.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,18 @@
package sysfs

import (
"bytes"
"errors"
"fmt"
"io/ioutil"
"os"
"path"
"path/filepath"
"regexp"
"strconv"
"strings"

"k8s.io/klog/v2"
)

const (
Expand Down Expand Up @@ -95,6 +100,9 @@ type SysFs interface {
GetCacheInfo(cpu int, cache string) (CacheInfo, error)

GetSystemUUID() (string, error)
// IsCPUOnline determines if CPU status from kernel hotplug machanism standpoint.
// See: https://www.kernel.org/doc/html/latest/core-api/cpu_hotplug.html
IsCPUOnline(dir string) bool
}

type realSysFs struct{}
Expand Down Expand Up @@ -326,3 +334,26 @@ func (fs *realSysFs) GetSystemUUID() (string, error) {
return "", err
}
}

func (fs *realSysFs) IsCPUOnline(dir string) bool {
cpuPath := fmt.Sprintf("%s/online", dir)
content, err := ioutil.ReadFile(cpuPath)
if err != nil {
pathErr, ok := err.(*os.PathError)
if ok {
if errors.Is(pathErr.Unwrap(), os.ErrNotExist) && isZeroCPU(dir) {
return true
}
}
klog.Warningf("unable to read %s: %s", cpuPath, err.Error())
return false
}
trimmed := bytes.TrimSpace(content)
return len(trimmed) == 1 && trimmed[0] == 49
}

func isZeroCPU(dir string) bool {
regex := regexp.MustCompile("cpu([0-9]*)")
matches := regex.FindStringSubmatch(dir)
return len(matches) == 2 && matches[1] == "0"
}
21 changes: 20 additions & 1 deletion utils/sysfs/sysfs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@
package sysfs

import (
"github.com/stretchr/testify/assert"
"os"
"strconv"
"testing"

"github.com/stretchr/testify/assert"
)

func TestGetNodes(t *testing.T) {
Expand Down Expand Up @@ -122,3 +123,21 @@ func TestGetHugePagesNrWhenFileIsMissing(t *testing.T) {
assert.NotNil(t, err)
assert.Equal(t, "", rawHugePageNr)
}

func TestIsCPUOnline(t *testing.T) {
sysFs := NewRealSysFs()
online := sysFs.IsCPUOnline("./testdata/node0/cpu0")
assert.True(t, online)

online = sysFs.IsCPUOnline("./testdata/node0/cpu1")
assert.False(t, online)
}

func TestIsCPUOnlineNoFileAndCPU0MustBeOnline(t *testing.T) {
sysFs := NewRealSysFs()
online := sysFs.IsCPUOnline("./testdata/missing_online/node0/cpu0")
assert.True(t, online)

online = sysFs.IsCPUOnline("./testdata/missing_online/node0/cpu33")
assert.False(t, online)
}
Empty file.
Empty file.
1 change: 1 addition & 0 deletions utils/sysfs/testdata/node0/cpu0/online
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1
1 change: 1 addition & 0 deletions utils/sysfs/testdata/node0/cpu1/online
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0
8 changes: 6 additions & 2 deletions utils/sysinfo/sysinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,10 +220,11 @@ func GetNodesInfo(sysFs sysfs.SysFs) ([]info.Node, int, error) {
return nil, 0, err
}
node.Cores = cores
for _, core := range cores {
allLogicalCoresCount += len(core.Threads)
}
}

allLogicalCoresCount += len(cpuDirs)

// On some Linux platforms(such as Arm64 guest kernel), cache info may not exist.
// So, we should ignore error here.
err = addCacheInfo(sysFs, &node)
Expand Down Expand Up @@ -384,6 +385,9 @@ func getCoresInfo(sysFs sysfs.SysFs, cpuDirs []string) ([]info.Core, error) {
if err != nil {
return nil, fmt.Errorf("Unexpected format of CPU directory, cpuDirRegExp %s, cpuDir: %s", cpuDirRegExp, cpuDir)
}
if !sysFs.IsCPUOnline(cpuDir) {
continue
}

rawPhysicalID, err := sysFs.GetCoreID(cpuDir)
if os.IsNotExist(err) {
Expand Down
174 changes: 173 additions & 1 deletion utils/sysinfo/sysinfo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,136 @@ func TestGetNodesInfo(t *testing.T) {
assert.JSONEq(t, expectedNodes, string(nodesJSON))
}

func TestGetNodesInfoWithOfflineCPUs(t *testing.T) {
fakeSys := &fakesysfs.FakeSysFs{}
c := sysfs.CacheInfo{
Size: 32 * 1024,
Type: "unified",
Level: 3,
Cpus: 1,
}
fakeSys.SetCacheInfo(c)

nodesPaths := []string{
"/fakeSysfs/devices/system/node/node0",
"/fakeSysfs/devices/system/node/node1",
}
fakeSys.SetNodesPaths(nodesPaths, nil)

cpusPaths := map[string][]string{
"/fakeSysfs/devices/system/node/node0": {
"/fakeSysfs/devices/system/node/node0/cpu0",
"/fakeSysfs/devices/system/node/node0/cpu1",
},
"/fakeSysfs/devices/system/node/node1": {
"/fakeSysfs/devices/system/node/node0/cpu2",
"/fakeSysfs/devices/system/node/node0/cpu3",
},
}
fakeSys.SetCPUsPaths(cpusPaths, nil)

coreThread := map[string]string{
"/fakeSysfs/devices/system/node/node0/cpu0": "0",
"/fakeSysfs/devices/system/node/node0/cpu1": "0",
"/fakeSysfs/devices/system/node/node0/cpu2": "1",
"/fakeSysfs/devices/system/node/node0/cpu3": "1",
}
fakeSys.SetCoreThreads(coreThread, nil)
fakeSys.SetOnlineCPUs(map[string]interface{}{
"/fakeSysfs/devices/system/node/node0/cpu0": nil,
"/fakeSysfs/devices/system/node/node0/cpu2": nil,
})

memTotal := "MemTotal: 32817192 kB"
fakeSys.SetMemory(memTotal, nil)

hugePages := []os.FileInfo{
&fakesysfs.FileInfo{EntryName: "hugepages-2048kB"},
}
fakeSys.SetHugePages(hugePages, nil)

hugePageNr := map[string]string{
"/fakeSysfs/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages": "1",
"/fakeSysfs/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages": "1",
}
fakeSys.SetHugePagesNr(hugePageNr, nil)

physicalPackageIDs := map[string]string{
"/fakeSysfs/devices/system/node/node0/cpu0": "0",
"/fakeSysfs/devices/system/node/node0/cpu1": "0",
"/fakeSysfs/devices/system/node/node0/cpu2": "1",
"/fakeSysfs/devices/system/node/node0/cpu3": "1",
}
fakeSys.SetPhysicalPackageIDs(physicalPackageIDs, nil)

nodes, cores, err := GetNodesInfo(fakeSys)
assert.Nil(t, err)
assert.Equal(t, 2, len(nodes))
assert.Equal(t, 2, cores)

nodesJSON, err := json.Marshal(nodes)
assert.Nil(t, err)
expectedNodes := `
[
{
"node_id": 0,
"memory": 33604804608,
"hugepages": [
{
"page_size": 2048,
"num_pages": 1
}
],
"cores": [
{
"core_id": 0,
"thread_ids": [
0
],
"caches": null,
"socket_id": 0
}
],
"caches": [
{
"size": 32768,
"type": "unified",
"level": 3
}
]
},
{
"node_id": 1,
"memory": 33604804608,
"hugepages": [
{
"page_size": 2048,
"num_pages": 1
}
],
"cores": [
{
"core_id": 1,
"thread_ids": [
2
],
"caches": null,
"socket_id": 1
}
],
"caches": [
{
"size": 32768,
"type": "unified",
"level": 3
}
]
}
]
`
assert.JSONEq(t, expectedNodes, string(nodesJSON))
}

func TestGetNodesWithoutMemoryInfo(t *testing.T) {
fakeSys := &fakesysfs.FakeSysFs{}
c := sysfs.CacheInfo{
Expand Down Expand Up @@ -775,7 +905,7 @@ func TestGetNodesWhenTopologyDirMissingForOneCPU(t *testing.T) {
assert.Nil(t, err)

assert.Equal(t, 1, len(nodes))
assert.Equal(t, 2, cores)
assert.Equal(t, 1, cores)

sort.Slice(nodes, func(i, j int) bool {
return nodes[i].Id < nodes[j].Id
Expand Down Expand Up @@ -955,6 +1085,48 @@ func TestGetCoresInfoWhenCoreIDIsNotDigit(t *testing.T) {
assert.Equal(t, []info.Core(nil), cores)
}

func TestGetCoresInfoWithOnlineOfflineFile(t *testing.T) {
sysFs := &fakesysfs.FakeSysFs{}
nodesPaths := []string{
"/fakeSysfs/devices/system/node/node0",
}
sysFs.SetNodesPaths(nodesPaths, nil)

cpusPaths := map[string][]string{
"/fakeSysfs/devices/system/node/node0": {
"/fakeSysfs/devices/system/node/node0/cpu0",
"/fakeSysfs/devices/system/node/node0/cpu1",
},
}
sysFs.SetCPUsPaths(cpusPaths, nil)

coreThread := map[string]string{
"/fakeSysfs/devices/system/node/node0/cpu0": "0",
"/fakeSysfs/devices/system/node/node0/cpu1": "0",
}
sysFs.SetCoreThreads(coreThread, nil)
sysFs.SetOnlineCPUs(map[string]interface{}{"/fakeSysfs/devices/system/node/node0/cpu0": nil})
sysFs.SetPhysicalPackageIDs(map[string]string{
"/fakeSysfs/devices/system/node/node0/cpu0": "0",
"/fakeSysfs/devices/system/node/node0/cpu1": "0",
}, nil)

cores, err := getCoresInfo(
sysFs,
[]string{"/fakeSysfs/devices/system/node/node0/cpu0", "/fakeSysfs/devices/system/node/node0/cpu1"},
)
assert.NoError(t, err)
expected := []info.Core{
{
Id: 0,
Threads: []int{0},
Caches: nil,
SocketID: 0,
},
}
assert.Equal(t, expected, cores)
}

func TestGetBlockDeviceInfo(t *testing.T) {
fakeSys := fakesysfs.FakeSysFs{}
disks, err := GetBlockDeviceInfo(&fakeSys)
Expand Down

0 comments on commit 4059641

Please sign in to comment.