From ab32d21313aa82dd6c88b24c1bfcbaf8a970dcea Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Thu, 4 May 2017 13:04:11 +0800 Subject: [PATCH] libcontainer: add support for Intel RDT/CAT in runc About Intel RDT/CAT feature: Intel platforms with new Xeon CPU support Intel Resource Director Technology (RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which currently supports L3 cache resource allocation. This feature provides a way for the software to restrict cache allocation to a defined 'subset' of L3 cache which may be overlapping with other 'subsets'. The different subsets are identified by class of service (CLOS) and each CLOS has a capacity bitmask (CBM). More information about Intel RDT/CAT can be found in section 17.17 of the Intel Software Developer Manual. About Intel RDT/CAT kernel interface: In Linux 4.10 kernel or newer, the interface is defined and exposed via "resource control" filesystem, which is a "cgroup-like" interface. Compared with cgroups, it has similar process management lifecycle and interfaces in a container. But unlike cgroups' hierarchy, it has a single-level filesystem layout. Intel RDT "resource control" filesystem hierarchy: mount -t resctrl resctrl /sys/fs/resctrl tree /sys/fs/resctrl /sys/fs/resctrl/ |-- info | |-- L3 | |-- cbm_mask | |-- min_cbm_bits | |-- num_closids |-- cpus |-- schemata |-- tasks |-- <container_id> |-- cpus |-- schemata |-- tasks For runc, we can make use of `tasks` and `schemata` configuration for L3 cache resource constraints. The file `tasks` has a list of tasks that belong to this group (e.g., "<container_id>" group). Tasks can be added to a group by writing the task ID to the "tasks" file (which will automatically remove them from the previous group to which they belonged). New tasks created by fork(2) and clone(2) are added to the same group as their parent. If a pid is not in any sub group, it is in the root group. 
The file `schemata` holds the allocation bitmasks/values for the L3 cache on each socket; each entry contains an L3 cache id and a capacity bitmask (CBM). Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..." For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0` which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. The valid L3 cache CBM is a *contiguous set of bits* and the number of bits that can be set is less than the max bit. The maximum number of bits in the CBM varies among supported Intel Xeon platforms. In Intel RDT "resource control" filesystem layout, the CBM in a group should be a subset of the CBM in root. Kernel will check if it is valid when writing. E.g., 0xfffff in root indicates the max bits of CBM is 20 bits, which maps to the entire L3 cache capacity. Some valid CBM values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00, etc. For more information about Intel RDT/CAT kernel interface: https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt An example for runc: Consider a two-socket machine with two L3 caches where the default CBM is 0xfffff and the max CBM length is 20 bits. 
With this configuration, tasks inside the container only have access to the "upper" 80% of L3 cache id 0 and the "lower" 50% L3 cache id 1: "linux": { "intelRdt": { "l3CacheSchema": "L3:0=ffff0;1=3ff" } } Signed-off-by: Xiaochen Shen --- events.go | 25 +- libcontainer/configs/config.go | 4 + libcontainer/configs/intelrdt.go | 7 + libcontainer/configs/validate/validator.go | 17 + libcontainer/container_linux.go | 17 + libcontainer/container_linux_test.go | 83 ++- libcontainer/factory_linux.go | 23 + libcontainer/factory_linux_test.go | 28 +- libcontainer/intelrdt/intelrdt.go | 557 +++++++++++++++++++++ libcontainer/intelrdt/intelrdt_test.go | 43 ++ libcontainer/intelrdt/stats.go | 20 + libcontainer/intelrdt/util_test.go | 67 +++ libcontainer/process_linux.go | 11 + libcontainer/specconv/spec_linux.go | 6 + libcontainer/stats_linux.go | 6 +- utils_linux.go | 5 + 16 files changed, 908 insertions(+), 11 deletions(-) create mode 100644 libcontainer/configs/intelrdt.go create mode 100644 libcontainer/intelrdt/intelrdt.go create mode 100644 libcontainer/intelrdt/intelrdt_test.go create mode 100644 libcontainer/intelrdt/stats.go create mode 100644 libcontainer/intelrdt/util_test.go diff --git a/events.go b/events.go index 6c21e5259f5..e4c6fcf5097 100644 --- a/events.go +++ b/events.go @@ -24,11 +24,12 @@ type event struct { // stats is the runc specific stats structure for stability when encoding and decoding stats. 
type stats struct { - CPU cpu `json:"cpu"` - Memory memory `json:"memory"` - Pids pids `json:"pids"` - Blkio blkio `json:"blkio"` - Hugetlb map[string]hugetlb `json:"hugetlb"` + CPU cpu `json:"cpu"` + Memory memory `json:"memory"` + Pids pids `json:"pids"` + Blkio blkio `json:"blkio"` + Hugetlb map[string]hugetlb `json:"hugetlb"` + IntelRdt intelRdt `json:"intel_rdt"` } type hugetlb struct { @@ -95,6 +96,12 @@ type memory struct { Raw map[string]uint64 `json:"raw,omitempty"` } +type intelRdt struct { + // The read-only default "schema" in root, for reference + L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"` + L3CacheSchema string `json:"l3_cache_schema,omitempty"` +} + var eventsCommand = cli.Command{ Name: "events", Usage: "display container events such as OOM notifications, cpu, memory, and IO usage statistics", @@ -226,6 +233,14 @@ func convertLibcontainerStats(ls *libcontainer.Stats) *stats { for k, v := range cg.HugetlbStats { s.Hugetlb[k] = convertHugtlb(v) } + + is := ls.IntelRdtStats + if is == nil { + return &s + } + s.IntelRdt.L3CacheSchemaRoot = is.IntelRdtRootStats.L3CacheSchema + s.IntelRdt.L3CacheSchema = is.IntelRdtStats.L3CacheSchema + return &s } diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index 98f4b8585f3..9262aec62d2 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -186,6 +186,10 @@ type Config struct { // Rootless specifies whether the container is a rootless container. 
Rootless bool `json:"rootless"` + + // IntelRdt specifies specific settings for Intel RDT/CAT group that the container is + // placed into to limit the resources (e.g., L3 cache) the container has available + IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` } type Hooks struct { diff --git a/libcontainer/configs/intelrdt.go b/libcontainer/configs/intelrdt.go new file mode 100644 index 00000000000..36bd5f96a11 --- /dev/null +++ b/libcontainer/configs/intelrdt.go @@ -0,0 +1,7 @@ +package configs + +type IntelRdt struct { + // The schema for L3 cache id and capacity bitmask (CBM) + // Format: "L3:=;=;..." + L3CacheSchema string `json:"l3_cache_schema,omitempty"` +} diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go index 8284345442c..8042d9a1f86 100644 --- a/libcontainer/configs/validate/validator.go +++ b/libcontainer/configs/validate/validator.go @@ -7,6 +7,7 @@ import ( "strings" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" selinux "github.com/opencontainers/selinux/go-selinux" ) @@ -40,6 +41,9 @@ func (v *ConfigValidator) Validate(config *configs.Config) error { if err := v.sysctl(config); err != nil { return err } + if err := v.intelrdt(config); err != nil { + return err + } if config.Rootless { if err := v.rootless(config); err != nil { return err @@ -153,6 +157,19 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error { return nil } +func (v *ConfigValidator) intelrdt(config *configs.Config) error { + if config.IntelRdt != nil { + if !intelrdt.IsIntelRdtEnabled() { + return fmt.Errorf("intelRdt is specified in config, but Intel RDT is not supported") + } + if config.IntelRdt.L3CacheSchema == "" { + return fmt.Errorf("intelRdt is specified in config, but intelRdt.l3CacheSchema is empty") + } + } + + return nil +} + func isSymbolicLink(path string) (bool, error) { fi, err := os.Lstat(path) if err != nil { diff --git 
a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 4a7c453f958..467fd0362d8 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -22,6 +22,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/criurpc" + "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/resourcemanager" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/utils" @@ -65,6 +66,9 @@ type State struct { // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore ExternalDescriptors []string `json:"external_descriptors,omitempty"` + + // Intel RDT "resource control" filesystem path + IntelRdtPath string `json:"intel_rdt_path"` } // Container is a libcontainer container object. @@ -163,6 +167,13 @@ func (c *linuxContainer) Stats() (*Stats, error) { if err != nil { return stats, newSystemErrorWithCause(err, "getting container stats from cgroups") } + if intelRdtManager, ok := c.resourceManagers["intelrdt"]; ok == true { + intelRdtStats, err := intelRdtManager.GetStats() + if err != nil { + return stats, newSystemErrorWithCause(err, "getting container's Intel RDT stats") + } + stats.IntelRdtStats = intelRdtStats.(*intelrdt.Stats) + } for _, iface := range c.config.Networks { switch iface.Type { case "veth": @@ -444,6 +455,7 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, return &setnsProcess{ cmd: cmd, cgroupPaths: c.resourceManagers["cgroups"].GetPaths(), + intelRdtPath: state.IntelRdtPath, childPipe: childPipe, parentPipe: parentPipe, config: c.newInitConfig(p), @@ -1418,6 +1430,10 @@ func (c *linuxContainer) currentState() (*State, error) { startTime, _ = c.initProcess.startTime() externalDescriptors = c.initProcess.externalDescriptors() } + intelRdtPath, err := 
intelrdt.GetIntelRdtPath(c.ID()) + if err != nil { + intelRdtPath = "" + } state := &State{ BaseState: BaseState{ ID: c.ID(), @@ -1428,6 +1444,7 @@ func (c *linuxContainer) currentState() (*State, error) { }, Rootless: c.config.Rootless, CgroupPaths: c.resourceManagers["cgroups"].GetPaths(), + IntelRdtPath: intelRdtPath, NamespacePaths: make(map[configs.NamespaceType]string), ExternalDescriptors: externalDescriptors, } diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go index 24c58787b4a..ba761177bf3 100644 --- a/libcontainer/container_linux_test.go +++ b/libcontainer/container_linux_test.go @@ -9,6 +9,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/resourcemanager" ) @@ -19,6 +20,13 @@ type mockCgroupManager struct { paths map[string]string } +type mockIntelRdtManager struct { + pids []int + allPids []int + stats *intelrdt.Stats + path string +} + func (m *mockCgroupManager) GetPids() ([]int, error) { return m.pids, nil } @@ -51,6 +59,40 @@ func (m *mockCgroupManager) Freeze(state configs.FreezerState) error { return nil } +func (m *mockIntelRdtManager) GetPids() ([]int, error) { + return m.pids, nil +} + +func (m *mockIntelRdtManager) GetAllPids() ([]int, error) { + return m.allPids, nil +} + +func (m *mockIntelRdtManager) GetStats() (interface{}, error) { + return m.stats, nil +} + +func (m *mockIntelRdtManager) Apply(pid int) error { + return nil +} + +func (m *mockIntelRdtManager) Set(container *configs.Config) error { + return nil +} + +func (m *mockIntelRdtManager) Destroy() error { + return nil +} + +func (m *mockIntelRdtManager) GetPaths() map[string]string { + paths := make(map[string]string) + paths["intelrdt"] = m.path + return paths +} + +func (m *mockIntelRdtManager) Freeze(state configs.FreezerState) error { + return nil +} + type 
mockProcess struct { _pid int started string @@ -121,6 +163,14 @@ func TestGetContainerStats(t *testing.T) { }, }, } + container.resourceManagers["intelrdt"] = &mockIntelRdtManager{ + pids: []int{1, 2, 3}, + stats: &intelrdt.Stats{ + IntelRdtStats: intelrdt.IntelRdtStats{ + L3CacheSchema: "L3:0=ffff0;1=fff00", + }, + }, + } stats, err := container.Stats() if err != nil { t.Fatal(err) @@ -131,13 +181,22 @@ func TestGetContainerStats(t *testing.T) { if stats.CgroupStats.MemoryStats.Usage.Usage != 1024 { t.Fatalf("expected memory usage 1024 but recevied %d", stats.CgroupStats.MemoryStats.Usage.Usage) } + if intelrdt.IsIntelRdtEnabled() { + if stats.IntelRdtStats == nil { + t.Fatal("intel rdt stats are nil") + } + if stats.IntelRdtStats.IntelRdtStats.L3CacheSchema != "L3:0=ffff0;1=fff00" { + t.Fatalf("expected L3CacheSchema L3:0=ffff0;1=fff00 but recevied %s", stats.IntelRdtStats.IntelRdtStats.L3CacheSchema) + } + } } func TestGetContainerState(t *testing.T) { var ( - pid = os.Getpid() - expectedMemoryPath = "/sys/fs/cgroup/memory/myid" - expectedNetworkPath = "/networks/fd" + pid = os.Getpid() + expectedMemoryPath = "/sys/fs/cgroup/memory/myid" + expectedNetworkPath = "/networks/fd" + expectedIntelRdtPath = "/sys/fs/resctrl/myid" ) container := &linuxContainer{ id: "myid", @@ -170,6 +229,15 @@ func TestGetContainerState(t *testing.T) { "memory": expectedMemoryPath, }, } + container.resourceManagers["intelrdt"] = &mockIntelRdtManager{ + pids: []int{1, 2, 3}, + stats: &intelrdt.Stats{ + IntelRdtStats: intelrdt.IntelRdtStats{ + L3CacheSchema: "L3:0=ffff0;1=fff00", + }, + }, + path: expectedIntelRdtPath, + } container.state = &createdState{c: container} state, err := container.State() if err != nil { @@ -188,6 +256,15 @@ func TestGetContainerState(t *testing.T) { if memPath := paths["memory"]; memPath != expectedMemoryPath { t.Fatalf("expected memory path %q but received %q", expectedMemoryPath, memPath) } + if intelrdt.IsIntelRdtEnabled() { + intelRdtPath := 
state.IntelRdtPath + if intelRdtPath == "" { + t.Fatal("intel rdt path should not be empty") + } + if intelRdtPath != expectedIntelRdtPath { + t.Fatalf("expected intel rdt path %q but received %q", expectedIntelRdtPath, intelRdtPath) + } + } for _, ns := range container.config.Namespaces { path := state.NamespacePaths[ns.Type] if path == "" { diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index 8a769778bb6..ca7913c5252 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -19,6 +19,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs/validate" + "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/resourcemanager" "github.com/opencontainers/runc/libcontainer/utils" ) @@ -89,6 +90,19 @@ func RootlessCgroups(l *LinuxFactory) error { return nil } +// IntelRdtfs is an options func to configure a LinuxFactory to return +// containers that use the Intel RDT "resource control" filesystem to +// create and manage Intel Xeon platform shared resources (e.g., L3 cache). +func IntelRdtFs(l *LinuxFactory) error { + l.NewIntelRdtManager = func(config *configs.Config, id string) intelrdt.Manager { + return &intelrdt.IntelRdtManager{ + Config: config, + Id: id, + } + } + return nil +} + // TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs. func TmpfsRoot(l *LinuxFactory) error { mounted, err := mount.Mounted(l.Root) @@ -153,6 +167,9 @@ type LinuxFactory struct { // NewCgroupsManager returns an initialized cgroups manager for a single container. NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager + + // NewIntelRdtManager returns an initialized Intel RDT manager for a single container. 
+ NewIntelRdtManager func(config *configs.Config, id string) intelrdt.Manager } func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) { @@ -197,6 +214,9 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err } resourceManagers := make(map[string]resourcemanager.ResourceManager) resourceManagers["cgroups"] = l.NewCgroupsManager(config.Cgroups, nil) + if intelrdt.IsIntelRdtEnabled() { + resourceManagers["intelrdt"] = l.NewIntelRdtManager(config, id) + } c.resourceManagers = resourceManagers c.state = &stoppedState{c: c} return c, nil @@ -232,6 +252,9 @@ func (l *LinuxFactory) Load(id string) (Container, error) { } resourceManagers := make(map[string]resourcemanager.ResourceManager) resourceManagers["cgroups"] = l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths) + if intelrdt.IsIntelRdtEnabled() { + resourceManagers["intelrdt"] = l.NewIntelRdtManager(&state.Config, id) + } c.resourceManagers = resourceManagers c.state = &loadedState{c: c} if err := c.refreshState(); err != nil { diff --git a/libcontainer/factory_linux_test.go b/libcontainer/factory_linux_test.go index ea3b5132d77..53593934573 100644 --- a/libcontainer/factory_linux_test.go +++ b/libcontainer/factory_linux_test.go @@ -49,6 +49,32 @@ func TestFactoryNew(t *testing.T) { } } +func TestFactoryNewIntelRdt(t *testing.T) { + root, rerr := newTestRoot() + if rerr != nil { + t.Fatal(rerr) + } + defer os.RemoveAll(root) + factory, err := New(root, Cgroupfs, IntelRdtFs) + if err != nil { + t.Fatal(err) + } + if factory == nil { + t.Fatal("factory should not be nil") + } + lfactory, ok := factory.(*LinuxFactory) + if !ok { + t.Fatal("expected linux factory returned on linux based systems") + } + if lfactory.Root != root { + t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root) + } + + if factory.Type() != "libcontainer" { + t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer") + } +} + func 
TestFactoryNewTmpfs(t *testing.T) { root, rerr := newTestRoot() if rerr != nil { @@ -163,7 +189,7 @@ func TestFactoryLoadContainer(t *testing.T) { if err := marshal(filepath.Join(root, id, stateFilename), expectedState); err != nil { t.Fatal(err) } - factory, err := New(root, Cgroupfs) + factory, err := New(root, Cgroupfs, IntelRdtFs) if err != nil { t.Fatal(err) } diff --git a/libcontainer/intelrdt/intelrdt.go b/libcontainer/intelrdt/intelrdt.go new file mode 100644 index 00000000000..6b684500a1d --- /dev/null +++ b/libcontainer/intelrdt/intelrdt.go @@ -0,0 +1,557 @@ +// +build linux + +package intelrdt + +import ( + "bufio" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/resourcemanager" +) + +/* + * About Intel RDT/CAT feature: + * Intel platforms with new Xeon CPU support Resource Director Technology (RDT). + * Intel Cache Allocation Technology (CAT) is a sub-feature of RDT. Currently L3 + * Cache is the only resource that is supported in RDT. + * + * This feature provides a way for the software to restrict cache allocation to a + * defined 'subset' of L3 cache which may be overlapping with other 'subsets'. + * The different subsets are identified by class of service (CLOS) and each CLOS + * has a capacity bitmask (CBM). + * + * For more information about Intel RDT/CAT can be found in the section 17.17 + * of Intel Software Developer Manual. + * + * About Intel RDT/CAT kernel interface: + * In Linux 4.10 kernel or newer, the interface is defined and exposed via + * "resource control" filesystem, which is a "cgroup-like" interface. + * + * Comparing with cgroups, it has similar process management lifecycle and + * interfaces in a container. But unlike cgroups' hierarchy, it has single level + * filesystem layout. 
+ * + * Intel RDT "resource control" filesystem hierarchy: + * mount -t resctrl resctrl /sys/fs/resctrl + * tree /sys/fs/resctrl + * /sys/fs/resctrl/ + * |-- info + * | |-- L3 + * | |-- cbm_mask + * | |-- min_cbm_bits + * | |-- num_closids + * |-- cpus + * |-- schemata + * |-- tasks + * |-- <container_id> + * |-- cpus + * |-- schemata + * |-- tasks + * + * For runc, we can make use of `tasks` and `schemata` configuration for L3 cache + * resource constraints. + * + * The file `tasks` has a list of tasks that belong to this group (e.g., + * "<container_id>" group). Tasks can be added to a group by writing the task ID + * to the "tasks" file (which will automatically remove them from the previous + * group to which they belonged). New tasks created by fork(2) and clone(2) are + * added to the same group as their parent. If a pid is not in any sub group, it is + * in the root group. + * + * The file `schemata` has allocation bitmasks/values for L3 cache on each socket, + * which contains L3 cache id and capacity bitmask (CBM). + * Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..." + * For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0` + * which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + * + * The valid L3 cache CBM is a *contiguous set of bits* and the number of bits that can + * be set is less than the max bit. The max bits in the CBM varies among + * supported Intel Xeon platforms. In Intel RDT "resource control" filesystem + * layout, the CBM in a group should be a subset of the CBM in root. Kernel will + * check if it is valid when writing. E.g., 0xfffff in root indicates the max bits + * of CBM is 20 bits, which maps to the entire L3 cache capacity. Some valid CBM + * values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00, etc. 
+ * + * For more information about Intel RDT/CAT kernel interface: + * https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt + * + * An example for runc: + * Consider a two-socket machine with two L3 caches where the default CBM is + * 0xfffff and the max CBM length is 20 bits. With this configuration, tasks + * inside the container only have access to the "upper" 80% of L3 cache id 0 and + * the "lower" 50% L3 cache id 1: + * + * "linux": { + * "intelRdt": { + * "l3CacheSchema": "L3:0=ffff0;1=3ff" + * } + * } + */ + +type Manager interface { + resourcemanager.ResourceManager + + // Returns Intel RDT "resource control" filesystem path to save in + // a state file and to be able to restore the object later + GetPath() string +} + +// This implements interface Manager +type IntelRdtManager struct { + mu sync.Mutex + Config *configs.Config + Id string + Path string +} + +const ( + IntelRdtTasks = "tasks" +) + +var ( + // The absolute root path of the Intel RDT "resource control" filesystem + intelRdtRoot string + intelRdtRootLock sync.Mutex + + // The flag to indicate if Intel RDT is supported + isIntelRdtEnabled bool +) + +// The read-only Intel RDT related system information in root +type IntelRdtInfo struct { + CbmMask uint64 `json:"cbm_mask,omitempty"` + MinCbmBits uint64 `json:"min_cbm_bits,omitempty"` + NumClosids uint64 `json:"num_closids,omitempty"` +} + +type intelRdtData struct { + root string + config *configs.Config + pid int +} + +// Return the mount point path of Intel RDT "resource control" filesysem +func findIntelRdtMountpointDir() (string, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + text := s.Text() + fields := strings.Split(text, " ") + // Safe as mountinfo encodes mountpoints with spaces as \040. 
+ index := strings.Index(text, " - ") + postSeparatorFields := strings.Fields(text[index+3:]) + numPostFields := len(postSeparatorFields) + + // This is an error as we can't detect if the mount is for "Intel RDT" + if numPostFields == 0 { + return "", fmt.Errorf("Found no fields post '-' in %q", text) + } + + if postSeparatorFields[0] == "resctrl" { + // Check that the mount is properly formated. + if numPostFields < 3 { + return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) + } + + return fields[4], nil + } + } + if err := s.Err(); err != nil { + return "", err + } + + return "", NewNotFoundError("Intel RDT") +} + +// Gets the root path of Intel RDT "resource control" filesystem +func getIntelRdtRoot() (string, error) { + intelRdtRootLock.Lock() + defer intelRdtRootLock.Unlock() + + if intelRdtRoot != "" { + return intelRdtRoot, nil + } + + root, err := findIntelRdtMountpointDir() + if err != nil { + return "", err + } + + if _, err := os.Stat(root); err != nil { + return "", err + } + + intelRdtRoot = root + return intelRdtRoot, nil +} + +func isIntelRdtMounted() bool { + _, err := getIntelRdtRoot() + if err != nil { + return false + } + + return true +} + +func parseCpuInfoFile(path string) (bool, error) { + f, err := os.Open(path) + if err != nil { + return false, err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + if err := s.Err(); err != nil { + return false, err + } + + text := s.Text() + flags := strings.Split(text, " ") + + // "rdt_a" flag is set if Intel RDT is supported + for _, flag := range flags { + if flag == "rdt_a" { + return true, nil + } + } + } + return false, nil +} + +func parseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. 
Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// Gets a single uint64 value from the specified file. +func getIntelRdtParamUint(path, file string) (uint64, error) { + fileName := filepath.Join(path, file) + contents, err := ioutil.ReadFile(fileName) + if err != nil { + return 0, err + } + + res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64) + if err != nil { + return res, fmt.Errorf("unable to parse %q as a uint from file %q", string(contents), fileName) + } + return res, nil +} + +// Gets a string value from the specified file +func getIntelRdtParamString(path, file string) (string, error) { + contents, err := ioutil.ReadFile(filepath.Join(path, file)) + if err != nil { + return "", err + } + + return strings.TrimSpace(string(contents)), nil +} + +func readTasksFile(dir string) ([]int, error) { + f, err := os.Open(filepath.Join(dir, IntelRdtTasks)) + if err != nil { + return nil, err + } + defer f.Close() + + var ( + s = bufio.NewScanner(f) + out = []int{} + ) + + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, pid) + } + } + return out, nil +} + +func writeFile(dir, file, data string) error { + if dir == "" { + return fmt.Errorf("no such directory for %s", file) + } + if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data+"\n"), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", data, file, err) + } + return nil +} + +func getIntelRdtData(c *configs.Config, pid int) (*intelRdtData, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return nil, err + } + return &intelRdtData{ + root: rootPath, + config: c, + pid: pid, + }, nil +} + +// WriteIntelRdtTasks writes the specified pid into the "tasks" 
file +func WriteIntelRdtTasks(dir string, pid int) error { + if dir == "" { + return fmt.Errorf("no such directory for %s", IntelRdtTasks) + } + + // Dont attach any pid if -1 is specified as a pid + if pid != -1 { + if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", pid, IntelRdtTasks, err) + } + } + return nil +} + +// Check if Intel RDT is enabled +func IsIntelRdtEnabled() bool { + // We have checked the flag before + if isIntelRdtEnabled { + return true + } + + // 1. Check if hardware and kernel support Intel RDT feature + // "rdt_a" flag is set if supported + isFlagSet, err := parseCpuInfoFile("/proc/cpuinfo") + if !isFlagSet || err != nil { + isIntelRdtEnabled = false + return false + } + + // 2. Check if Intel RDT "resource control" filesystem is mounted + // The user guarantees to mount the filesystem + isIntelRdtEnabled = isIntelRdtMounted() + return isIntelRdtEnabled +} + +// Get Intel RDT "resource control" filesystem path +func GetIntelRdtPath(id string) (string, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return "", err + } + + path := filepath.Join(rootPath, id) + return path, nil +} + +// Get read-only Intel RDT related system information +func GetIntelRdtInfo() (*IntelRdtInfo, error) { + intelRdtInfo := &IntelRdtInfo{} + + rootPath, err := getIntelRdtRoot() + if err != nil { + return nil, err + } + + path := filepath.Join(rootPath, "info", "l3") + cbmMask, err := getIntelRdtParamUint(path, "cbm_mask") + if err != nil { + return nil, err + } + minCbmBits, err := getIntelRdtParamUint(path, "min_cbm_bits") + if err != nil { + return nil, err + } + numClosids, err := getIntelRdtParamUint(path, "num_closids") + if err != nil { + return nil, err + } + + intelRdtInfo.CbmMask = cbmMask + intelRdtInfo.MinCbmBits = minCbmBits + intelRdtInfo.NumClosids = numClosids + + return intelRdtInfo, nil +} + +// Applies configuration to the 
process with the specified pid +func (m *IntelRdtManager) Apply(pid int) (err error) { + d, err := getIntelRdtData(m.Config, pid) + if err != nil && !IsNotFound(err) { + return err + } + + m.mu.Lock() + defer m.mu.Unlock() + path, err := d.join(m.Id) + if err != nil { + return err + } + + m.Path = path + return nil +} + +// Returns the PIDs inside Intel RDT "resource control" filesystem at path +func (m *IntelRdtManager) GetPids() ([]int, error) { + return readTasksFile(m.GetPath()) +} + +// Returns all the PIDs inside Intel RDT "resource control" filesystem at path +func (m *IntelRdtManager) GetAllPids() ([]int, error) { + return m.GetPids() +} + +// Toggles the freezer cgroup according with specified state +func (m *IntelRdtManager) Freeze(state configs.FreezerState) error { + return nil +} + +// Destroys the Intel RDT "resource control" filesystem +func (m *IntelRdtManager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + if err := os.RemoveAll(m.Path); err != nil { + return err + } + m.Path = "" + return nil +} + +// Returns Intel RDT "resource control" filesystem paths to save in +// a state file and to be able to restore the object later +func (m *IntelRdtManager) GetPaths() map[string]string { + m.mu.Lock() + paths := make(map[string]string) + paths["intelrdt"] = m.Path + m.mu.Unlock() + return paths +} + +// Returns Intel RDT "resource control" filesystem path to save in +// a state file and to be able to restore the object later +func (m *IntelRdtManager) GetPath() string { + if m.Path == "" { + m.Path, _ = GetIntelRdtPath(m.Id) + } + return m.Path +} + +// Returns statistics for Intel RDT +func (m *IntelRdtManager) GetStats() (interface{}, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := NewStats() + + // The read-only default "schemata" in root, for reference + rootPath, err := getIntelRdtRoot() + if err != nil { + return nil, err + } + schemaRoot, err := getIntelRdtParamString(rootPath, "schemata") + if err != nil { + return nil, err + } + 
stats.IntelRdtRootStats.L3CacheSchema = schemaRoot + + // The stats in "container_id" group + schema, err := getIntelRdtParamString(m.GetPath(), "schemata") + if err != nil { + return nil, err + } + stats.IntelRdtStats.L3CacheSchema = schema + + return stats, nil +} + +// Set Intel RDT "resource control" filesystem as configured. +func (m *IntelRdtManager) Set(container *configs.Config) error { + path := m.GetPath() + + // About L3 cache schema file: + // The schema has allocation masks/values for L3 cache on each socket, + // which contains L3 cache id and capacity bitmask (CBM). + // Format: "L3:=;=;..." + // For example, on a two-socket machine, L3's schema line could be: + // L3:0=ff;1=c0 + // Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + // + // About L3 cache CBM validity: + // The valid L3 cache CBM is a *contiguous bits set* and number of + // bits that can be set is less than the max bit. The max bits in the + // CBM is varied among supported Intel Xeon platforms. In Intel RDT + // "resource control" filesystem layout, the CBM in a group should + // be a subset of the CBM in root. Kernel will check if it is valid + // when writing. + // e.g., 0xfffff in root indicates the max bits of CBM is 20 bits, + // which mapping to entire L3 cache capacity. Some valid CBM values + // to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. 
+ if container.IntelRdt != nil { + l3CacheSchema := container.IntelRdt.L3CacheSchema + if l3CacheSchema != "" { + if err := writeFile(path, "schemata", l3CacheSchema); err != nil { + return err + } + } + } + + return nil +} + +// Creates the group directory raw.root/id if it does not exist, writes raw.pid +// to it through WriteIntelRdtTasks, and returns the directory path +func (raw *intelRdtData) join(id string) (string, error) { + path := filepath.Join(raw.root, id) + if err := os.MkdirAll(path, 0755); err != nil { + return "", err + } + + if err := WriteIntelRdtTasks(path, raw.pid); err != nil { + return "", err + } + return path, nil +} + +// Error reporting that the mountpoint for ResourceControl was not found +type NotFoundError struct { + ResourceControl string +} + +func (e *NotFoundError) Error() string { + return fmt.Sprintf("mountpoint for %s not found", e.ResourceControl) +} + +// Returns a *NotFoundError for the named resource control +func NewNotFoundError(res string) error { + return &NotFoundError{ + ResourceControl: res, + } +} + +// Reports whether err is a *NotFoundError; a nil err yields false +func IsNotFound(err error) bool { + if err == nil { + return false + } + _, ok := err.(*NotFoundError) + return ok +} diff --git a/libcontainer/intelrdt/intelrdt_test.go b/libcontainer/intelrdt/intelrdt_test.go new file mode 100644 index 00000000000..e85748b4a3c --- /dev/null +++ b/libcontainer/intelrdt/intelrdt_test.go @@ -0,0 +1,43 @@ +// +build linux + +package intelrdt + +import ( + "testing" +) + +func TestIntelRdtSetL3CacheSchema(t *testing.T) { + if !IsIntelRdtEnabled() { + t.Skip("Intel RDT is not enabled") + } + + helper := NewIntelRdtTestUtil(t) + defer helper.cleanup() + + const ( + l3CacheSchemaBefore = "L3:0=f;1=f0" + l3CacheSchemaAfter = "L3:0=f0;1=f" + ) + + helper.writeFileContents(map[string]string{ + "schemata": l3CacheSchemaBefore + "\n", + }) + + helper.IntelRdtData.config.IntelRdt.L3CacheSchema = l3CacheSchemaAfter + intelrdt := &IntelRdtManager{ + Config: helper.IntelRdtData.config, + Path: helper.IntelRdtPath, + } + if err := intelrdt.Set(helper.IntelRdtData.config); err != nil { + t.Fatal(err) + } + + value, err := getIntelRdtParamString(helper.IntelRdtPath, "schemata") + if err != nil { + t.Fatalf("Failed to parse file 'schemata' - %s", err) + } + + if value != l3CacheSchemaAfter { + t.Fatal("Got the
wrong value, set 'schemata' failed.") + } +} diff --git a/libcontainer/intelrdt/stats.go b/libcontainer/intelrdt/stats.go new file mode 100644 index 00000000000..a2412d9cb7f --- /dev/null +++ b/libcontainer/intelrdt/stats.go @@ -0,0 +1,20 @@ +// +build linux + +package intelrdt + +type IntelRdtRootStats struct { + L3CacheSchema string `json:"l3_cache_schema,omitempty"` // "schemata" read from the getIntelRdtRoot() path by GetStats +} + +type IntelRdtStats struct { + L3CacheSchema string `json:"l3_cache_schema,omitempty"` // "schemata" read from the manager's GetPath() by GetStats +} + +type Stats struct { // NOTE: encoding/json never omits struct-typed fields, so omitempty has no effect on the two fields below + IntelRdtRootStats IntelRdtRootStats `json:"intel_rdt_root_stats,omitempty"` // default "schemata" in root, for reference + IntelRdtStats IntelRdtStats `json:"intel_rdt_stats,omitempty"` // stats in the "container_id" group +} + +func NewStats() *Stats { + return &Stats{} +} diff --git a/libcontainer/intelrdt/util_test.go b/libcontainer/intelrdt/util_test.go new file mode 100644 index 00000000000..970b6ce360e --- /dev/null +++ b/libcontainer/intelrdt/util_test.go @@ -0,0 +1,67 @@ +// +build linux + +/* + * Utility for testing Intel RDT operations. + * Creates a mock of the Intel RDT "resource control" filesystem for the duration of the test.
+ */ +package intelrdt + +import ( + "io/ioutil" + "os" + "path/filepath" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +type intelRdtTestUtil struct { + // intelRdt data to use in tests + IntelRdtData *intelRdtData + + // Path to the mock Intel RDT "resource control" filesystem directory + IntelRdtPath string + + // Temporary directory to store mock Intel RDT "resource control" filesystem + tempDir string + t *testing.T +} + +// Creates a new test util: makes a temporary directory, uses it as the +// intelRdtData root, and creates a "resctrl" subdirectory in it to act as +// the mock Intel RDT "resource control" filesystem +func NewIntelRdtTestUtil(t *testing.T) *intelRdtTestUtil { + d := &intelRdtData{ + config: &configs.Config{ + IntelRdt: &configs.IntelRdt{}, + }, + } + tempDir, err := ioutil.TempDir("", "intelrdt_test") + if err != nil { + t.Fatal(err) + } + d.root = tempDir + testIntelRdtPath := filepath.Join(d.root, "resctrl") + + // Ensure the full mock Intel RDT "resource control" filesystem path exists + err = os.MkdirAll(testIntelRdtPath, 0755) + if err != nil { + t.Fatal(err) + } + return &intelRdtTestUtil{IntelRdtData: d, IntelRdtPath: testIntelRdtPath, tempDir: tempDir, t: t} +} + +// Removes the temporary directory that holds the mock filesystem +func (c *intelRdtTestUtil) cleanup() { + os.RemoveAll(c.tempDir) +} + +// Write the specified contents on the mock of the specified Intel RDT "resource control" files +func (c *intelRdtTestUtil) writeFileContents(fileContents map[string]string) { + for file, contents := range fileContents { + err := writeFile(c.IntelRdtPath, file, contents) + if err != nil { + c.t.Fatal(err) + } + } +} diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index b06461b9645..8719554d889 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -15,6 +15,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/resourcemanager" "github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils" @@ -48,6 +49,7 @@ type setnsProcess struct { parentPipe *os.File childPipe *os.File cgroupPaths map[string]string + intelRdtPath string config *initConfig fds []string process *Process @@ -87,6 +89,15 @@ func (p *setnsProcess) start() (err error) { return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) } } + if p.intelRdtPath != "" { + // Add the pid to the Intel RDT "resource control" group only if Stat on its path succeeds; any Stat error is skipped, not reported + _, err := os.Stat(p.intelRdtPath) + if err == nil { + if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil { + return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid()) + } + } + } // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 1575ae03793..eb9598f242a 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -244,6 +244,12 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { createHooks(spec, config) config.MountLabel = spec.Linux.MountLabel config.Version = specs.Version + if spec.Linux.IntelRdt != nil { + config.IntelRdt = &configs.IntelRdt{} + if spec.Linux.IntelRdt.L3CacheSchema != "" { + config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema + } + } return config, nil } diff --git a/libcontainer/stats_linux.go b/libcontainer/stats_linux.go index c629dc67de9..29fd641e9dd 100644 --- a/libcontainer/stats_linux.go +++ b/libcontainer/stats_linux.go @@ -1,8 +1,10 @@ package libcontainer import "github.com/opencontainers/runc/libcontainer/cgroups" +import "github.com/opencontainers/runc/libcontainer/intelrdt" type Stats struct { - Interfaces []*NetworkInterface - CgroupStats *cgroups.Stats + Interfaces []*NetworkInterface + CgroupStats
*cgroups.Stats + IntelRdtStats *intelrdt.Stats } diff --git a/utils_linux.go b/utils_linux.go index c6a8c028e6a..3e9d55dfa4a 100644 --- a/utils_linux.go +++ b/utils_linux.go @@ -16,6 +16,7 @@ import ( "github.com/opencontainers/runc/libcontainer" "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/specconv" "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runtime-spec/specs-go" @@ -41,6 +42,10 @@ func loadFactory(context *cli.Context) (libcontainer.Factory, error) { return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available") } } + if intelrdt.IsIntelRdtEnabled() { // pass the extra libcontainer.IntelRdtFs option to New only when Intel RDT is reported enabled + intelRdtManager := libcontainer.IntelRdtFs + return libcontainer.New(abs, cgroupManager, intelRdtManager, libcontainer.CriuPath(context.GlobalString("criu"))) + } return libcontainer.New(abs, cgroupManager, libcontainer.CriuPath(context.GlobalString("criu"))) }