diff --git a/CHANGELOG.md b/CHANGELOG.md index 7452f70faf0..70456291713 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased 1.1.z] +## [1.1.12] - 2024-01-31 + +> Now you're thinking with Portals™! + +### Security + +* Fix [CVE-2024-21626][cve-2024-21626], a container breakout attack that took + advantage of a file descriptor that was leaked internally within runc (but + never leaked to the container process). In addition to fixing the leak, + several strict hardening measures were added to ensure that future internal + leaks could not be used to break out in this manner again. Based on our + research, while no other container runtime had a similar leak, none had any + of the hardening steps we've introduced (and some runtimes would not check + for any file descriptors that a calling process may have leaked to them, + allowing for container breakouts due to basic user error). + +[cve-2024-21626]: https://github.com/opencontainers/runc/security/advisories/GHSA-xr7r-f8xq-vfvv + ## [1.1.11] - 2024-01-01 > Happy New Year! @@ -493,7 +511,8 @@ implementation (libcontainer) is *not* covered by this policy. [1.0.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.0.1 -[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.11...release-1.1 +[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.12...release-1.1 +[1.1.12]: https://github.com/opencontainers/runc/compare/v1.1.11...v1.1.12 [1.1.11]: https://github.com/opencontainers/runc/compare/v1.1.10...v1.1.11 [1.1.10]: https://github.com/opencontainers/runc/compare/v1.1.9...v1.1.10 [1.1.9]: https://github.com/opencontainers/runc/compare/v1.1.8...v1.1.9 diff --git a/VERSION b/VERSION index 5aafe9bab6b..9e5f80d9e8b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.1.11+dev +1.1.12+dev diff --git a/libcontainer/cgroups/file.go b/libcontainer/cgroups/file.go index 48b263a1661..f6e1b73bd92 100644 --- a/libcontainer/cgroups/file.go +++ b/libcontainer/cgroups/file.go @@ -77,16 +77,16 @@ var ( // TestMode is set to true by unit tests that need "fake" cgroupfs. TestMode bool - cgroupFd int = -1 - prepOnce sync.Once - prepErr error - resolveFlags uint64 + cgroupRootHandle *os.File + prepOnce sync.Once + prepErr error + resolveFlags uint64 ) func prepareOpenat2() error { prepOnce.Do(func() { fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{ - Flags: unix.O_DIRECTORY | unix.O_PATH, + Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC, }) if err != nil { prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err} @@ -97,15 +97,16 @@ func prepareOpenat2() error { } return } + file := os.NewFile(uintptr(fd), cgroupfsDir) + var st unix.Statfs_t - if err = unix.Fstatfs(fd, &st); err != nil { + if err := unix.Fstatfs(int(file.Fd()), &st); err != nil { prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err} logrus.Warnf("falling back to securejoin: %s", prepErr) return } - cgroupFd = fd - + cgroupRootHandle = file resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS if st.Type == unix.CGROUP2_SUPER_MAGIC { // cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks @@ -132,7 +133,7 @@ func openFile(dir, file string, flags int) (*os.File, error) { return openFallback(path, flags, mode) } - fd, err := unix.Openat2(cgroupFd, relPath, + fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath, &unix.OpenHow{ Resolve: resolveFlags, Flags: uint64(flags) | unix.O_CLOEXEC, @@ -140,20 +141,20 @@ func openFile(dir, file string, flags int) (*os.File, error) { }) if err != nil { err = &os.PathError{Op: "openat2", Path: path, Err: err} - // Check if cgroupFd is still opened to cgroupfsDir + // Check if cgroupRootHandle is still opened to cgroupfsDir // (happens when this package is incorrectly used // across the chroot/pivot_root/mntns boundary, or // when /sys/fs/cgroup is remounted). // // TODO: if such usage will ever be common, amend this - // to reopen cgroupFd and retry openat2. - fdStr := strconv.Itoa(cgroupFd) + // to reopen cgroupRootHandle and retry openat2. + fdStr := strconv.Itoa(int(cgroupRootHandle.Fd())) fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr) if fdDest != cgroupfsDir { - // Wrap the error so it is clear that cgroupFd + // Wrap the error so it is clear that cgroupRootHandle // is opened to an unexpected/wrong directory. - err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w", - fdStr, fdDest, cgroupfsDir, err) + err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w", + cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err) } return nil, err } diff --git a/libcontainer/cgroups/fs/paths.go b/libcontainer/cgroups/fs/paths.go index 1092331b25d..2cb970a3d55 100644 --- a/libcontainer/cgroups/fs/paths.go +++ b/libcontainer/cgroups/fs/paths.go @@ -83,6 +83,7 @@ func tryDefaultCgroupRoot() string { if err != nil { return "" } + defer dir.Close() names, err := dir.Readdirnames(1) if err != nil { return "" diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 59aa0338ac6..40b332f9810 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -353,6 +353,15 @@ func (c *linuxContainer) start(process *Process) (retErr error) { }() } + // Before starting "runc init", mark all non-stdio open files as O_CLOEXEC + // to make sure we don't leak any files into "runc init". Any files to be + // passed to "runc init" through ExtraFiles will get dup2'd by the Go + // runtime and thus their O_CLOEXEC flag will be cleared. This is some + // additional protection against attacks like CVE-2024-21626, by making + // sure we never leak files to "runc init" we didn't intend to. + if err := utils.CloseExecFrom(3); err != nil { + return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err) + } if err := parent.start(); err != nil { return fmt.Errorf("unable to start container process: %w", err) } diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 5b88c71fc83..d9f18139f54 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -8,6 +8,7 @@ import ( "io" "net" "os" + "path/filepath" "strings" "unsafe" @@ -135,6 +136,32 @@ func populateProcessEnvironment(env []string) error { return nil } +// verifyCwd ensures that the current directory is actually inside the mount +// namespace root of the current process. +func verifyCwd() error { + // getcwd(2) on Linux detects if cwd is outside of the rootfs of the + // current mount namespace root, and in that case prefixes "(unreachable)" + // to the returned string. glibc's getcwd(3) and Go's Getwd() both detect + // when this happens and return ENOENT rather than returning a non-absolute + // path. In both cases we can therefore easily detect if we have an invalid + // cwd by checking the return value of getcwd(3). See getcwd(3) for more + // details, and CVE-2024-21626 for the security issue that motivated this + // check. + // + // We have to use unix.Getwd() here because os.Getwd() has a workaround for + // $PWD which involves doing stat(.), which can fail if the current + // directory is inaccessible to the container process. + if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) { + return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected") + } else if err != nil { + return fmt.Errorf("failed to verify if current working directory is safe: %w", err) + } else if !filepath.IsAbs(wd) { + // We shouldn't ever hit this, but check just in case. + return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd) + } + return nil +} + // finalizeNamespace drops the caps, sets the correct user // and working dir, and closes any leaked file descriptors // before executing the command inside the namespace @@ -193,6 +220,10 @@ func finalizeNamespace(config *initConfig) error { return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err) } } + // Make sure our final working directory is inside the container. + if err := verifyCwd(); err != nil { + return err + } if err := system.ClearKeepCaps(); err != nil { return fmt.Errorf("unable to clear keep caps: %w", err) } diff --git a/libcontainer/integration/seccomp_test.go b/libcontainer/integration/seccomp_test.go index 31092a0a5d2..ecdfa7957df 100644 --- a/libcontainer/integration/seccomp_test.go +++ b/libcontainer/integration/seccomp_test.go @@ -13,7 +13,7 @@ import ( libseccomp "github.com/seccomp/libseccomp-golang" ) -func TestSeccompDenyGetcwdWithErrno(t *testing.T) { +func TestSeccompDenySyslogWithErrno(t *testing.T) { if testing.Short() { return } @@ -25,7 +25,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) { DefaultAction: configs.Allow, Syscalls: []*configs.Syscall{ { - Name: "getcwd", + Name: "syslog", Action: configs.Errno, ErrnoRet: &errnoRet, }, @@ -39,7 +39,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) { buffers := newStdBuffers() pwd := &libcontainer.Process{ Cwd: "/", - Args: []string{"pwd"}, + Args: []string{"dmesg"}, Env: standardEnvironment, Stdin: buffers.Stdin, Stdout: buffers.Stdout, @@ -65,17 +65,17 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) { } if exitCode == 0 { - t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode) + t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode) } - expected := "pwd: getcwd: No such process" + expected := "dmesg: klogctl: No such process" actual := strings.Trim(buffers.Stderr.String(), "\n") if actual != expected { t.Fatalf("Expected output %s but got %s\n", expected, actual) } } -func TestSeccompDenyGetcwd(t *testing.T) { +func TestSeccompDenySyslog(t *testing.T) { if testing.Short() { return } @@ -85,7 +85,7 @@ func TestSeccompDenyGetcwd(t *testing.T) { DefaultAction: configs.Allow, Syscalls: []*configs.Syscall{ { - Name: "getcwd", + Name: "syslog", Action: configs.Errno, }, }, @@ -98,7 +98,7 @@ func TestSeccompDenyGetcwd(t *testing.T) { buffers := newStdBuffers() pwd := &libcontainer.Process{ Cwd: "/", - Args: []string{"pwd"}, + Args: []string{"dmesg"}, Env: standardEnvironment, Stdin: buffers.Stdin, Stdout: buffers.Stdout, @@ -124,10 +124,10 @@ func TestSeccompDenyGetcwd(t *testing.T) { } if exitCode == 0 { - t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode) + t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode) } - expected := "pwd: getcwd: Operation not permitted" + expected := "dmesg: klogctl: Operation not permitted" actual := strings.Trim(buffers.Stderr.String(), "\n") if actual != expected { t.Fatalf("Expected output %s but got %s\n", expected, actual) diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index 09ab552b3d1..d1bb12273c0 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -4,6 +4,7 @@ import ( "errors" "fmt" "os" + "os/exec" "strconv" "github.com/opencontainers/selinux/go-selinux" @@ -14,6 +15,7 @@ import ( "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" ) // linuxSetnsInit performs the container's initialization for running a new process @@ -82,6 +84,21 @@ func (l *linuxSetnsInit) Init() error { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { return err } + + // Check for the arg before waiting to make sure it exists and it is + // returned as a create time error. + name, err := exec.LookPath(l.config.Args[0]) + if err != nil { + return err + } + // exec.LookPath in Go < 1.20 might return no error for an executable + // residing on a file system mounted with noexec flag, so perform this + // extra check now while we can still return a proper error. + // TODO: remove this once go < 1.20 is not supported. + if err := eaccess(name); err != nil { + return &os.PathError{Op: "eaccess", Path: name, Err: err} + } + // Set seccomp as close to execve as possible, so as few syscalls take // place afterward (reducing the amount of syscalls that users need to // enable in their seccomp profiles). @@ -101,5 +118,23 @@ func (l *linuxSetnsInit) Init() error { return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} } - return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal runc fds as the + // execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath + // the Go runtime, we must not do any file operations after this point + // (otherwise the (*os.File) finaliser could close the wrong file). See + // CVE-2024-21626 for more information as to why this protection is + // necessary. + // + // This is not needed for runc-dmz, because the extra execve(2) step means + // that all O_CLOEXEC file descriptors have already been closed and thus + // the second execve(2) from runc-dmz cannot access internal file + // descriptors from runc. + if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } + return system.Exec(name, l.config.Args[0:], os.Environ()) } diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index c09a7bed30e..d1d94352f93 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -17,6 +17,7 @@ import ( "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" ) type linuxStandardInit struct { @@ -258,5 +259,23 @@ func (l *linuxStandardInit) Init() error { return err } + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal runc fds as the + // execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath + // the Go runtime, we must not do any file operations after this point + // (otherwise the (*os.File) finaliser could close the wrong file). See + // CVE-2024-21626 for more information as to why this protection is + // necessary. + // + // This is not needed for runc-dmz, because the extra execve(2) step means + // that all O_CLOEXEC file descriptors have already been closed and thus + // the second execve(2) from runc-dmz cannot access internal file + // descriptors from runc. + if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } return system.Exec(name, l.config.Args[0:], os.Environ()) } diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index 220d0b43937..bf3237a2911 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "strconv" + _ "unsafe" // for go:linkname "golang.org/x/sys/unix" ) @@ -23,9 +24,11 @@ func EnsureProcHandle(fh *os.File) error { return nil } -// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for -// the process (except for those below the given fd value). -func CloseExecFrom(minFd int) error { +type fdFunc func(fd int) + +// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in +// the current process. +func fdRangeFrom(minFd int, fn fdFunc) error { fdDir, err := os.Open("/proc/self/fd") if err != nil { return err @@ -50,15 +53,60 @@ func CloseExecFrom(minFd int) error { if fd < minFd { continue } - // Intentionally ignore errors from unix.CloseOnExec -- the cases where - // this might fail are basically file descriptors that have already - // been closed (including and especially the one that was created when - // os.ReadDir did the "opendir" syscall). - unix.CloseOnExec(fd) + // Ignore the file descriptor we used for readdir, as it will be closed + // when we return. + if uintptr(fd) == fdDir.Fd() { + continue + } + // Run the closure. + fn(fd) } return nil } +// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or +// equal to minFd in the current process. +func CloseExecFrom(minFd int) error { + return fdRangeFrom(minFd, unix.CloseOnExec) +} + +//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor + +// In order to make sure we do not close the internal epoll descriptors the Go +// runtime uses, we need to ensure that we skip descriptors that match +// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing, +// unfortunately there's no other way to be sure we're only keeping the file +// descriptors the Go runtime needs. Hopefully nothing blows up doing this... +func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive + +// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the +// current process, except for those critical to Go's runtime (such as the +// netpoll management descriptors). +// +// NOTE: That this function is incredibly dangerous to use in most Go code, as +// closing file descriptors from underneath *os.File handles can lead to very +// bad behaviour (the closed file descriptor can be re-used and then any +// *os.File operations would apply to the wrong file). This function is only +// intended to be called from the last stage of runc init. +func UnsafeCloseFrom(minFd int) error { + // We must not close some file descriptors. + return fdRangeFrom(minFd, func(fd int) { + if runtime_IsPollDescriptor(uintptr(fd)) { + // These are the Go runtimes internal netpoll file descriptors. + // These file descriptors are operated on deep in the Go scheduler, + // and closing those files from underneath Go can result in panics. + // There is no issue with keeping them because they are not + // executable and are not useful to an attacker anyway. Also we + // don't have any choice. + return + } + // There's nothing we can do about errors from close(2), and the + // only likely error to be seen is EBADF which indicates the fd was + // already closed (in which case, we got what we wanted). + _ = unix.Close(fd) + }) +} + // NewSockPair returns a new unix socket pair func NewSockPair(name string) (parent *os.File, child *os.File, err error) { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) diff --git a/update.go b/update.go index 9ce5a2e835b..6d582ddddec 100644 --- a/update.go +++ b/update.go @@ -174,6 +174,7 @@ other options are ignored. if err != nil { return err } + defer f.Close() } err = json.NewDecoder(f).Decode(&r) if err != nil {