From 506552a88bd3455e80a9b3829568e94ec0160309 Mon Sep 17 00:00:00 2001 From: "hang.jiang" Date: Fri, 1 Sep 2023 16:17:13 +0800 Subject: [PATCH 1/9] Fix File to Close (This is a cherry-pick of 937ca107c3d22da77eb8e8030f2342253b980980.) Signed-off-by: hang.jiang Fixes: GHSA-xr7r-f8xq-vfvv CVE-2024-21626 Signed-off-by: Aleksa Sarai --- libcontainer/cgroups/fs/paths.go | 1 + update.go | 1 + 2 files changed, 2 insertions(+) diff --git a/libcontainer/cgroups/fs/paths.go b/libcontainer/cgroups/fs/paths.go index 1092331b25d..2cb970a3d55 100644 --- a/libcontainer/cgroups/fs/paths.go +++ b/libcontainer/cgroups/fs/paths.go @@ -83,6 +83,7 @@ func tryDefaultCgroupRoot() string { if err != nil { return "" } + defer dir.Close() names, err := dir.Readdirnames(1) if err != nil { return "" diff --git a/update.go b/update.go index 9ce5a2e835b..6d582ddddec 100644 --- a/update.go +++ b/update.go @@ -174,6 +174,7 @@ other options are ignored. if err != nil { return err } + defer f.Close() } err = json.NewDecoder(f).Decode(&r) if err != nil { From 0994249a5ec4e363bfcf9af58a87a722e9a3a31b Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 26 Dec 2023 23:53:07 +1100 Subject: [PATCH 2/9] init: verify after chdir that cwd is inside the container If a file descriptor of a directory in the host's mount namespace is leaked to runc init, a malicious config.json could use /proc/self/fd/... as a working directory to allow for host filesystem access after the container runs. This can also be exploited by a container process if it knows that an administrator will use "runc exec --cwd" and the target --cwd (the attacker can change that cwd to be a symlink pointing to /proc/self/fd/... and wait for the process to exec and then snoop on /proc/$pid/cwd to get access to the host). The former issue can lead to a critical vulnerability in Docker and Kubernetes, while the latter is a container breakout. We can (ab)use the fact that getcwd(2) on Linux detects this exact case, and getcwd(3) and Go's Getwd() return an error as a result. Thus, if we just do os.Getwd() after chdir we can easily detect this case and error out. In runc 1.1, a /sys/fs/cgroup handle happens to be leaked to "runc init", making this exploitable. On runc main it just so happens that the leaked /sys/fs/cgroup gets clobbered and thus this is only consistently exploitable for runc 1.1. Fixes: GHSA-xr7r-f8xq-vfvv CVE-2024-21626 Co-developed-by: lifubang Signed-off-by: lifubang [refactored the implementation and added more comments] Signed-off-by: Aleksa Sarai --- libcontainer/init_linux.go | 31 ++++++++++++++++++++++++ libcontainer/integration/seccomp_test.go | 20 +++++++-------- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 5b88c71fc83..d9f18139f54 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -8,6 +8,7 @@ import ( "io" "net" "os" + "path/filepath" "strings" "unsafe" @@ -135,6 +136,32 @@ func populateProcessEnvironment(env []string) error { return nil } +// verifyCwd ensures that the current directory is actually inside the mount +// namespace root of the current process. +func verifyCwd() error { + // getcwd(2) on Linux detects if cwd is outside of the rootfs of the + // current mount namespace root, and in that case prefixes "(unreachable)" + // to the returned string. glibc's getcwd(3) and Go's Getwd() both detect + // when this happens and return ENOENT rather than returning a non-absolute + // path. In both cases we can therefore easily detect if we have an invalid + // cwd by checking the return value of getcwd(3). See getcwd(3) for more + // details, and CVE-2024-21626 for the security issue that motivated this + // check. + // + // We have to use unix.Getwd() here because os.Getwd() has a workaround for + // $PWD which involves doing stat(.), which can fail if the current + // directory is inaccessible to the container process. + if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) { + return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected") + } else if err != nil { + return fmt.Errorf("failed to verify if current working directory is safe: %w", err) + } else if !filepath.IsAbs(wd) { + // We shouldn't ever hit this, but check just in case. + return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd) + } + return nil +} + // finalizeNamespace drops the caps, sets the correct user // and working dir, and closes any leaked file descriptors // before executing the command inside the namespace @@ -193,6 +220,10 @@ func finalizeNamespace(config *initConfig) error { return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err) } } + // Make sure our final working directory is inside the container. + if err := verifyCwd(); err != nil { + return err + } if err := system.ClearKeepCaps(); err != nil { return fmt.Errorf("unable to clear keep caps: %w", err) } diff --git a/libcontainer/integration/seccomp_test.go b/libcontainer/integration/seccomp_test.go index 31092a0a5d2..ecdfa7957df 100644 --- a/libcontainer/integration/seccomp_test.go +++ b/libcontainer/integration/seccomp_test.go @@ -13,7 +13,7 @@ import ( libseccomp "github.com/seccomp/libseccomp-golang" ) -func TestSeccompDenyGetcwdWithErrno(t *testing.T) { +func TestSeccompDenySyslogWithErrno(t *testing.T) { if testing.Short() { return } @@ -25,7 +25,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) { DefaultAction: configs.Allow, Syscalls: []*configs.Syscall{ { - Name: "getcwd", + Name: "syslog", Action: configs.Errno, ErrnoRet: &errnoRet, }, @@ -39,7 +39,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) { buffers := newStdBuffers() pwd := &libcontainer.Process{ Cwd: "/", - Args: []string{"pwd"}, + Args: []string{"dmesg"}, Env: standardEnvironment, Stdin: buffers.Stdin, Stdout: buffers.Stdout, @@ -65,17 +65,17 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) { } if exitCode == 0 { - t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode) + t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode) } - expected := "pwd: getcwd: No such process" + expected := "dmesg: klogctl: No such process" actual := strings.Trim(buffers.Stderr.String(), "\n") if actual != expected { t.Fatalf("Expected output %s but got %s\n", expected, actual) } } -func TestSeccompDenyGetcwd(t *testing.T) { +func TestSeccompDenySyslog(t *testing.T) { if testing.Short() { return } @@ -85,7 +85,7 @@ func TestSeccompDenyGetcwd(t *testing.T) { DefaultAction: configs.Allow, Syscalls: []*configs.Syscall{ { - Name: "getcwd", + Name: "syslog", Action: configs.Errno, }, }, @@ -98,7 +98,7 @@ func TestSeccompDenyGetcwd(t *testing.T) { buffers := newStdBuffers() pwd := &libcontainer.Process{ Cwd: "/", - Args: []string{"pwd"}, + Args: []string{"dmesg"}, Env: standardEnvironment, Stdin: buffers.Stdin, Stdout: buffers.Stdout, @@ -124,10 +124,10 @@ func TestSeccompDenyGetcwd(t *testing.T) { } if exitCode == 0 { - t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode) + t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode) } - expected := "pwd: getcwd: Operation not permitted" + expected := "dmesg: klogctl: Operation not permitted" actual := strings.Trim(buffers.Stderr.String(), "\n") if actual != expected { t.Fatalf("Expected output %s but got %s\n", expected, actual) From fbe3eed1e568a376f371d2ced1b4ac16b7d7adde Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 5 Jan 2024 01:42:32 +1100 Subject: [PATCH 3/9] setns init: do explicit lookup of execve argument early (This is a partial backport of a minor change included in commit dac41717465462b21fab5b5942fe4cb3f47d7e53.) This mirrors the logic in standard_init_linux.go, and also ensures that we do not call exec.LookPath in the final execve step. While this is okay for regular binaries, it seems exec.LookPath calls os.Getenv which tries to emit a log entry to the test harness when running in "go test" mode. In a future patch (in order to fix CVE-2024-21626), we will close all of the file descriptors immediately before execve, which would mean the file descriptor for test harness logging would be closed at execve time. So, moving exec.LookPath earlier is necessary. Ref: dac417174654 ("runc-dmz: reduce memfd binary cloning cost with small C binary") Signed-off-by: Aleksa Sarai --- libcontainer/setns_init_linux.go | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index 09ab552b3d1..e891773ec57 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -4,6 +4,7 @@ import ( "errors" "fmt" "os" + "os/exec" "strconv" "github.com/opencontainers/selinux/go-selinux" @@ -82,6 +83,21 @@ func (l *linuxSetnsInit) Init() error { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { return err } + + // Check for the arg before waiting to make sure it exists and it is + // returned as a create time error. + name, err := exec.LookPath(l.config.Args[0]) + if err != nil { + return err + } + // exec.LookPath in Go < 1.20 might return no error for an executable + // residing on a file system mounted with noexec flag, so perform this + // extra check now while we can still return a proper error. + // TODO: remove this once go < 1.20 is not supported. + if err := eaccess(name); err != nil { + return &os.PathError{Op: "eaccess", Path: name, Err: err} + } + // Set seccomp as close to execve as possible, so as few syscalls take // place afterward (reducing the amount of syscalls that users need to // enable in their seccomp profiles). @@ -101,5 +117,5 @@ func (l *linuxSetnsInit) Init() error { return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} } - return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) + return system.Exec(name, l.config.Args[0:], os.Environ()) } From 284ba3057e428f8d6c7afcc3b0ac752e525957df Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 2 Jan 2024 14:58:28 +1100 Subject: [PATCH 4/9] init: close internal fds before execve If we leak a file descriptor referencing the host filesystem, an attacker could use a /proc/self/fd magic-link as the source for execve to execute a host binary in the container. This would allow the binary itself (or a process inside the container in the 'runc exec' case) to write to a host binary, leading to a container escape. The simple solution is to make sure we close all file descriptors immediately before the execve(2) step. Doing this earlier can lead to very serious issues in Go (as file descriptors can be reused, any (*os.File) reference could start silently operating on a different file) so we have to do it as late as possible. Unfortunately, there are some Go runtime file descriptors that we must not close (otherwise the Go scheduler panics randomly). The only way of being sure which file descriptors cannot be closed is to sneakily go:linkname the runtime internal "internal/poll.IsPollDescriptor" function. This is almost certainly not recommended but there isn't any other way to be absolutely sure, while also closing any other possible files. In addition, we can keep the logrus forwarding logfd open because you cannot execve a pipe and the contents of the pipe are so restricted (JSON-encoded in a format we pick) that it seems unlikely you could even construct shellcode. Closing the logfd causes issues if there is an error returned from execve. In mainline runc, runc-dmz protects us against this attack because the intermediate execve(2) closes all of the O_CLOEXEC internal runc file descriptors and thus runc-dmz cannot access them to attack the host. Fixes: GHSA-xr7r-f8xq-vfvv CVE-2024-21626 Signed-off-by: Aleksa Sarai --- libcontainer/logs/logs.go | 9 ++++ libcontainer/setns_init_linux.go | 19 ++++++++ libcontainer/standard_init_linux.go | 19 ++++++++ libcontainer/utils/utils_unix.go | 72 +++++++++++++++++++++++++---- 4 files changed, 111 insertions(+), 8 deletions(-) diff --git a/libcontainer/logs/logs.go b/libcontainer/logs/logs.go index 95deb0d6ca7..349a18ed383 100644 --- a/libcontainer/logs/logs.go +++ b/libcontainer/logs/logs.go @@ -4,10 +4,19 @@ import ( "bufio" "encoding/json" "io" + "os" "github.com/sirupsen/logrus" ) +// IsLogrusFd returns whether the provided fd matches the one that logrus is +// currently outputting to. This should only ever be called by UnsafeCloseFrom +// from `runc init`. +func IsLogrusFd(fd uintptr) bool { + file, ok := logrus.StandardLogger().Out.(*os.File) + return ok && file.Fd() == fd +} + func ForwardLogs(logPipe io.ReadCloser) chan error { done := make(chan error, 1) s := bufio.NewScanner(logPipe) diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index e891773ec57..d1bb12273c0 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -15,6 +15,7 @@ import ( "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" ) // linuxSetnsInit performs the container's initialization for running a new process @@ -117,5 +118,23 @@ func (l *linuxSetnsInit) Init() error { return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} } + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal runc fds as the + // execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath + // the Go runtime, we must not do any file operations after this point + // (otherwise the (*os.File) finaliser could close the wrong file). See + // CVE-2024-21626 for more information as to why this protection is + // necessary. + // + // This is not needed for runc-dmz, because the extra execve(2) step means + // that all O_CLOEXEC file descriptors have already been closed and thus + // the second execve(2) from runc-dmz cannot access internal file + // descriptors from runc. + if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } return system.Exec(name, l.config.Args[0:], os.Environ()) } diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index c09a7bed30e..d1d94352f93 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -17,6 +17,7 @@ import ( "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" ) type linuxStandardInit struct { @@ -258,5 +259,23 @@ func (l *linuxStandardInit) Init() error { return err } + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal runc fds as the + // execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath + // the Go runtime, we must not do any file operations after this point + // (otherwise the (*os.File) finaliser could close the wrong file). See + // CVE-2024-21626 for more information as to why this protection is + // necessary. + // + // This is not needed for runc-dmz, because the extra execve(2) step means + // that all O_CLOEXEC file descriptors have already been closed and thus + // the second execve(2) from runc-dmz cannot access internal file + // descriptors from runc. + if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } return system.Exec(name, l.config.Args[0:], os.Environ()) } diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index 220d0b43937..842f9b0a6d2 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -7,8 +7,11 @@ import ( "fmt" "os" "strconv" + _ "unsafe" // for go:linkname "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/logs" ) // EnsureProcHandle returns whether or not the given file handle is on procfs. @@ -23,9 +26,11 @@ func EnsureProcHandle(fh *os.File) error { return nil } -// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for -// the process (except for those below the given fd value). -func CloseExecFrom(minFd int) error { +type fdFunc func(fd int) + +// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in +// the current process. +func fdRangeFrom(minFd int, fn fdFunc) error { fdDir, err := os.Open("/proc/self/fd") if err != nil { return err @@ -50,15 +55,66 @@ func CloseExecFrom(minFd int) error { if fd < minFd { continue } - // Intentionally ignore errors from unix.CloseOnExec -- the cases where - // this might fail are basically file descriptors that have already - // been closed (including and especially the one that was created when - // os.ReadDir did the "opendir" syscall). - unix.CloseOnExec(fd) + // Ignore the file descriptor we used for readdir, as it will be closed + // when we return. + if uintptr(fd) == fdDir.Fd() { + continue + } + // Run the closure. + fn(fd) } return nil } +// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or +// equal to minFd in the current process. +func CloseExecFrom(minFd int) error { + return fdRangeFrom(minFd, unix.CloseOnExec) +} + +//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor + +// In order to make sure we do not close the internal epoll descriptors the Go +// runtime uses, we need to ensure that we skip descriptors that match +// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing, +// unfortunately there's no other way to be sure we're only keeping the file +// descriptors the Go runtime needs. Hopefully nothing blows up doing this... +func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive + +// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the +// current process, except for those critical to Go's runtime (such as the +// netpoll management descriptors). +// +// NOTE: That this function is incredibly dangerous to use in most Go code, as +// closing file descriptors from underneath *os.File handles can lead to very +// bad behaviour (the closed file descriptor can be re-used and then any +// *os.File operations would apply to the wrong file). This function is only +// intended to be called from the last stage of runc init. +func UnsafeCloseFrom(minFd int) error { + // We must not close some file descriptors. + return fdRangeFrom(minFd, func(fd int) { + if runtime_IsPollDescriptor(uintptr(fd)) { + // These are the Go runtimes internal netpoll file descriptors. + // These file descriptors are operated on deep in the Go scheduler, + // and closing those files from underneath Go can result in panics. + // There is no issue with keeping them because they are not + // executable and are not useful to an attacker anyway. Also we + // don't have any choice. + return + } + if logs.IsLogrusFd(uintptr(fd)) { + // Do not close the logrus output fd. We cannot exec a pipe, and + // the contents are quite limited (very little attacker control, + // JSON-encoded) making shellcode attacks unlikely. + return + } + // There's nothing we can do about errors from close(2), and the + // only likely error to be seen is EBADF which indicates the fd was + // already closed (in which case, we got what we wanted). + _ = unix.Close(fd) + }) +} + // NewSockPair returns a new unix socket pair func NewSockPair(name string) (parent *os.File, child *os.File, err error) { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) From b6633f48a8c970433737b9be5bfe4f25d58a5aa7 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 26 Dec 2023 23:36:51 +1100 Subject: [PATCH 5/9] cgroup: plug leaks of /sys/fs/cgroup handle We auto-close this file descriptor in the final exec step, but it's probably a good idea to not possibly leak the file descriptor to "runc init" (we've had issues like this in the past) especially since it is a directory handle from the host mount namespace. In practice, on runc 1.1 this does leak to "runc init" but on main the handle has a low enough file descriptor that it gets clobbered by the ForkExec of "runc init". OPEN_TREE_CLONE would let us protect this handle even further, but the performance impact of creating an anonymous mount namespace is probably not worth it. Also, switch to using an *os.File for the handle so if it goes out of scope during setup (i.e. an error occurs during setup) it will get cleaned up by the GC. Fixes: GHSA-xr7r-f8xq-vfvv CVE-2024-21626 Signed-off-by: Aleksa Sarai --- libcontainer/cgroups/file.go | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/libcontainer/cgroups/file.go b/libcontainer/cgroups/file.go index 48b263a1661..f6e1b73bd92 100644 --- a/libcontainer/cgroups/file.go +++ b/libcontainer/cgroups/file.go @@ -77,16 +77,16 @@ var ( // TestMode is set to true by unit tests that need "fake" cgroupfs. TestMode bool - cgroupFd int = -1 - prepOnce sync.Once - prepErr error - resolveFlags uint64 + cgroupRootHandle *os.File + prepOnce sync.Once + prepErr error + resolveFlags uint64 ) func prepareOpenat2() error { prepOnce.Do(func() { fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{ - Flags: unix.O_DIRECTORY | unix.O_PATH, + Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC, }) if err != nil { prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err} @@ -97,15 +97,16 @@ func prepareOpenat2() error { } return } + file := os.NewFile(uintptr(fd), cgroupfsDir) + var st unix.Statfs_t - if err = unix.Fstatfs(fd, &st); err != nil { + if err := unix.Fstatfs(int(file.Fd()), &st); err != nil { prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err} logrus.Warnf("falling back to securejoin: %s", prepErr) return } - cgroupFd = fd - + cgroupRootHandle = file resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS if st.Type == unix.CGROUP2_SUPER_MAGIC { // cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks @@ -132,7 +133,7 @@ func openFile(dir, file string, flags int) (*os.File, error) { return openFallback(path, flags, mode) } - fd, err := unix.Openat2(cgroupFd, relPath, + fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath, &unix.OpenHow{ Resolve: resolveFlags, Flags: uint64(flags) | unix.O_CLOEXEC, @@ -140,20 +141,20 @@ func openFile(dir, file string, flags int) (*os.File, error) { }) if err != nil { err = &os.PathError{Op: "openat2", Path: path, Err: err} - // Check if cgroupFd is still opened to cgroupfsDir + // Check if cgroupRootHandle is still opened to cgroupfsDir // (happens when this package is incorrectly used // across the chroot/pivot_root/mntns boundary, or // when /sys/fs/cgroup is remounted). // // TODO: if such usage will ever be common, amend this - // to reopen cgroupFd and retry openat2. - fdStr := strconv.Itoa(cgroupFd) + // to reopen cgroupRootHandle and retry openat2. + fdStr := strconv.Itoa(int(cgroupRootHandle.Fd())) fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr) if fdDest != cgroupfsDir { - // Wrap the error so it is clear that cgroupFd + // Wrap the error so it is clear that cgroupRootHandle // is opened to an unexpected/wrong directory. - err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w", - fdStr, fdDest, cgroupfsDir, err) + err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w", + cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err) } return nil, err } From 683ad2ff3b01fb142ece7a8b3829de17150cf688 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 28 Dec 2023 00:42:23 +1100 Subject: [PATCH 6/9] libcontainer: mark all non-stdio fds O_CLOEXEC before spawning init Given the core issue in GHSA-xr7r-f8xq-vfvv was that we were unknowingly leaking file descriptors to "runc init", it seems prudent to make sure we proactively prevent this in the future. The solution is to simply mark all non-stdio file descriptors as O_CLOEXEC before we spawn "runc init". For libcontainer library users, this could result in unrelated files being marked as O_CLOEXEC -- however (for the same reason we are doing this for runc), for security reasons those files should've been marked as O_CLOEXEC anyway. Fixes: GHSA-xr7r-f8xq-vfvv CVE-2024-21626 Signed-off-by: Aleksa Sarai --- libcontainer/container_linux.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 59aa0338ac6..40b332f9810 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -353,6 +353,15 @@ func (c *linuxContainer) start(process *Process) (retErr error) { }() } + // Before starting "runc init", mark all non-stdio open files as O_CLOEXEC + // to make sure we don't leak any files into "runc init". Any files to be + // passed to "runc init" through ExtraFiles will get dup2'd by the Go + // runtime and thus their O_CLOEXEC flag will be cleared. This is some + // additional protection against attacks like CVE-2024-21626, by making + // sure we never leak files to "runc init" we didn't intend to. + if err := utils.CloseExecFrom(3); err != nil { + return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err) + } if err := parent.start(); err != nil { return fmt.Errorf("unable to start container process: %w", err) } From e9665f4d606b64bf9c4652ab2510da368bfbd951 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 20 Jan 2024 16:56:38 +1100 Subject: [PATCH 7/9] init: don't special-case logrus fds We close the logfd before execve so there's no need to special case it. In addition, it turns out that (*os.File).Fd() doesn't handle the case where the file was closed and so it seems suspect to use that kind of check. Signed-off-by: Aleksa Sarai --- libcontainer/logs/logs.go | 9 --------- libcontainer/utils/utils_unix.go | 8 -------- 2 files changed, 17 deletions(-) diff --git a/libcontainer/logs/logs.go b/libcontainer/logs/logs.go index 349a18ed383..95deb0d6ca7 100644 --- a/libcontainer/logs/logs.go +++ b/libcontainer/logs/logs.go @@ -4,19 +4,10 @@ import ( "bufio" "encoding/json" "io" - "os" "github.com/sirupsen/logrus" ) -// IsLogrusFd returns whether the provided fd matches the one that logrus is -// currently outputting to. This should only ever be called by UnsafeCloseFrom -// from `runc init`. -func IsLogrusFd(fd uintptr) bool { - file, ok := logrus.StandardLogger().Out.(*os.File) - return ok && file.Fd() == fd -} - func ForwardLogs(logPipe io.ReadCloser) chan error { done := make(chan error, 1) s := bufio.NewScanner(logPipe) diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index 842f9b0a6d2..bf3237a2911 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -10,8 +10,6 @@ import ( _ "unsafe" // for go:linkname "golang.org/x/sys/unix" - - "github.com/opencontainers/runc/libcontainer/logs" ) // EnsureProcHandle returns whether or not the given file handle is on procfs. @@ -102,12 +100,6 @@ func UnsafeCloseFrom(minFd int) error { // don't have any choice. return } - if logs.IsLogrusFd(uintptr(fd)) { - // Do not close the logrus output fd. We cannot exec a pipe, and - // the contents are quite limited (very little attacker control, - // JSON-encoded) making shellcode attacks unlikely. - return - } // There's nothing we can do about errors from close(2), and the // only likely error to be seen is EBADF which indicates the fd was // already closed (in which case, we got what we wanted). From 51d5e94601ceffbbd85688df1c928ecccbfa4685 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 18 Jan 2024 12:17:28 +1100 Subject: [PATCH 8/9] VERSION: release 1.1.12 Signed-off-by: Aleksa Sarai --- CHANGELOG.md | 21 ++++++++++++++++++++- VERSION | 2 +- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7452f70faf0..70456291713 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased 1.1.z] +## [1.1.12] - 2024-01-31 + +> Now you're thinking with Portals™! + +### Security + +* Fix [CVE-2024-21626][cve-2024-21626], a container breakout attack that took + advantage of a file descriptor that was leaked internally within runc (but + never leaked to the container process). In addition to fixing the leak, + several strict hardening measures were added to ensure that future internal + leaks could not be used to break out in this manner again. Based on our + research, while no other container runtime had a similar leak, none had any + of the hardening steps we've introduced (and some runtimes would not check + for any file descriptors that a calling process may have leaked to them, + allowing for container breakouts due to basic user error). + +[cve-2024-21626]: https://github.com/opencontainers/runc/security/advisories/GHSA-xr7r-f8xq-vfvv + ## [1.1.11] - 2024-01-01 > Happy New Year! @@ -493,7 +511,8 @@ implementation (libcontainer) is *not* covered by this policy. [1.0.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.0.1 -[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.11...release-1.1 +[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.12...release-1.1 +[1.1.12]: https://github.com/opencontainers/runc/compare/v1.1.11...v1.1.12 [1.1.11]: https://github.com/opencontainers/runc/compare/v1.1.10...v1.1.11 [1.1.10]: https://github.com/opencontainers/runc/compare/v1.1.9...v1.1.10 [1.1.9]: https://github.com/opencontainers/runc/compare/v1.1.8...v1.1.9 diff --git a/VERSION b/VERSION index 5aafe9bab6b..ccad953ac53 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.1.11+dev +1.1.12 From 29d6d87345aad2103a50848671617abb4273d621 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 20 Jan 2024 17:37:33 +1100 Subject: [PATCH 9/9] VERSION: back to development Signed-off-by: Aleksa Sarai --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index ccad953ac53..9e5f80d9e8b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.1.12 +1.1.12+dev