diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 8b5709cf6a0..21c9905e69e 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -541,6 +541,38 @@ func (c *Container) shouldSendMountSources() bool { return false } +// shouldSendIdmapSources says whether the child process must set up idmap mounts with +// the mount_setattr already done in the host user namespace. +func (c *Container) shouldSendIdmapSources() bool { + // nsexec.c mount_setattr() requires CAP_SYS_ADMIN in: + // * the user namespace the filesystem was mounted in; + // * the user namespace we're trying to idmap the mount to; + // * the owning user namespace of the mount namespace you're currently located in. + // + // See the comment from Christian Brauner: + // https://github.com/opencontainers/runc/pull/3717#discussion_r1103607972 + // + // Let's just rule out rootless, we don't have those permissions in the + // rootless case. + if c.config.RootlessEUID { + return false + } + + // For the time being we require userns to be in use. + if !c.config.Namespaces.Contains(configs.NEWUSER) { + return false + } + + // We need to send sources if there are idmap bind-mounts. 
+ for _, m := range c.config.Mounts { + if m.IsBind() && m.IsIDMapped() { + return true + } + } + + return false +} + func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) error { if !c.shouldSendMountSources() { return nil @@ -551,6 +583,16 @@ func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) er }) } +func (c *Container) sendIdmapSources(cmd *exec.Cmd, messageSockPair filePair) error { + if !c.shouldSendIdmapSources() { + return nil + } + + return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool { + return m.IsBind() && m.IsIDMapped() + }) +} + func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envVar string, condition func(*configs.Mount) bool) error { // Elements on these slices will be paired with mounts (see StartInitialization() and // prepareRootfs()). These slices MUST have the same size as c.config.Mounts. @@ -592,6 +634,9 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l if err := c.sendMountSources(cmd, messageSockPair); err != nil { return nil, err } + if err := c.sendIdmapSources(cmd, messageSockPair); err != nil { + return nil, err + } init := &initProcess{ cmd: cmd, @@ -2256,6 +2301,29 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa }) } + // Idmap mount sources to open. + if it == initStandard && c.shouldSendIdmapSources() { + var mounts []byte + for _, m := range c.config.Mounts { + if m.IsBind() && m.IsIDMapped() { + // While other parts of the code check this too (like + // libcontainer/specconv/spec_linux.go) we do it here also because some libcontainer + // users don't use those functions. + if strings.IndexByte(m.Source, 0) >= 0 { + return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source) + } + + mounts = append(mounts, []byte(m.Source)...) 
+ } + mounts = append(mounts, byte(0)) + } + + r.AddData(&Bytemsg{ + Type: IdmapSourcesAttr, + Value: mounts, + }) + } + return bytes.NewReader(r.Serialize()), nil } diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 4f6ed61c076..42cae1ccb65 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -48,12 +48,15 @@ type network struct { } type mountFds struct { - // Fds to use as source when mounting - // Size should be the same as container mounts, as it will be paired. + // sourceFds are the fds to use as source when mounting. + // The slice size should be the same as container mounts, as it will be + // paired with them. // The value -1 is used when no fd is needed for the mount. // Can't have a valid fd in the same position that other slices in this struct. // We need to use only one of these fds on any single mount. sourceFds []int + // Idem sourceFds, but fds of already created idmap mounts, to use with unix.MoveMount(). + idmapFds []int } // initConfig is used for transferring parameters from Exec() to Init() @@ -142,6 +145,12 @@ func StartInitialization() (retErr error) { return err } + // Get idmap fds. + idmapFds, err := parseFdsFromEnv("_LIBCONTAINER_IDMAP_FDS") + if err != nil { + return err + } + // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() @@ -157,7 +166,7 @@ func StartInitialization() (retErr error) { }() // If init succeeds, it will not return, hence none of the defers will be called. 
- return containerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds{sourceFds: mountSrcFds}) + return containerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) } func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) error { @@ -170,9 +179,9 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo } switch t { case initSetns: - // mountFds must be nil in this case. We don't mount while doing runc exec. - if mountFds.sourceFds != nil { - return errors.New("mount source fds must be nil; can't mount from exec") + // mount and idmap fds must be nil in this case. We don't mount while doing runc exec. + if mountFds.sourceFds != nil || mountFds.idmapFds != nil { + return errors.New("mount and idmap fds must be nil; can't mount from exec") } i := &linuxSetnsInit{ diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go index 6d1107e875d..17db81a29f3 100644 --- a/libcontainer/message_linux.go +++ b/libcontainer/message_linux.go @@ -22,6 +22,7 @@ const ( UidmapPathAttr uint16 = 27288 GidmapPathAttr uint16 = 27289 MountSourcesAttr uint16 = 27290 + IdmapSourcesAttr uint16 = 27291 ) type Int32msg struct { diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 92583a995db..6297276f8b2 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -33,6 +33,9 @@ /* Get all of the CLONE_NEW* flags. */ #include "namespace.h" +/* Get definitions for idmap sources */ +#include "idmap.h" + /* Synchronisation values. */ enum sync_t { SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ @@ -43,6 +46,8 @@ enum sync_t { SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */ SYNC_MOUNTSOURCES_PLS = 0x46, /* Tell parent to send mount sources by SCM_RIGHTS. */ SYNC_MOUNTSOURCES_ACK = 0x47, /* All mount sources have been sent. 
*/ + SYNC_MOUNT_IDMAP_PLS = 0x48, /* Tell parent to mount idmap sources. */ + SYNC_MOUNT_IDMAP_ACK = 0x49, /* All idmap mounts have been done. */ }; #define STAGE_SETUP -1 @@ -95,6 +100,10 @@ struct nlconfig_t { /* Mount sources opened outside the container userns. */ char *mountsources; size_t mountsources_len; + + /* Idmap sources opened outside the container userns which will be id mapped. */ + char *idmapsources; + size_t idmapsources_len; }; /* @@ -112,6 +121,7 @@ struct nlconfig_t { #define UIDMAPPATH_ATTR 27288 #define GIDMAPPATH_ATTR 27289 #define MOUNT_SOURCES_ATTR 27290 +#define IDMAP_SOURCES_ATTR 27291 /* * Use the raw syscall for versions of glibc which don't include a function for @@ -431,6 +441,10 @@ static void nl_parse(int fd, struct nlconfig_t *config) config->mountsources = current; config->mountsources_len = payload_len; break; + case IDMAP_SOURCES_ATTR: + config->idmapsources = current; + config->idmapsources_len = payload_len; + break; default: bail("unknown netlink message type %d", nlattr->nla_type); } @@ -650,6 +664,83 @@ void try_unshare(int flags, const char *msg) bail("failed to unshare %s", msg); } +void send_idmapsources(int sockfd, pid_t pid, char *idmap_src, int idmap_src_len) +{ + char proc_user_path[PATH_MAX]; + + /* Open the userns fd only once. + * Currently we only support idmap mounts that use the same mapping as + * the userns. This is validated in libcontainer/configs/validate/validator.go, + * so if we reached here, we know the mapping for the idmap is the same + * as the userns. This is why we just open the userns_fd once from the + * PID of the child process that has the userns already applied. 
+ */ + int ret = snprintf(proc_user_path, sizeof(proc_user_path), "/proc/%d/ns/user", pid); + if (ret < 0 || (size_t)ret >= sizeof(proc_user_path)) { + sane_kill(pid, SIGKILL); + bail("failed to create userns path string"); + } + + int userns_fd = open(proc_user_path, O_RDONLY | O_CLOEXEC | O_NOCTTY); + if (userns_fd < 0) { + sane_kill(pid, SIGKILL); + bail("failed to get user namespace fd"); + } + + char *idmap_end = idmap_src + idmap_src_len; + while (idmap_src < idmap_end) { + if (idmap_src[0] == '\0') { + idmap_src++; + continue; + } + + int fd_tree = sys_open_tree(-EBADF, idmap_src, + OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | + AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT); + if (fd_tree < 0) { + sane_kill(pid, SIGKILL); + if (errno == EINVAL) + bail("failed to use open_tree(2) with path: %s, the kernel doesn't supports ID-mapped mounts", idmap_src); + else + bail("failed to use open_tree(2) with path: %s", idmap_src); + } + + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_IDMAP, + .userns_fd = userns_fd, + }; + + ret = sys_mount_setattr(fd_tree, "", AT_EMPTY_PATH, &attr, sizeof(attr)); + if (ret < 0) { + sane_kill(pid, SIGKILL); + if (errno == EINVAL) + bail("failed to change mount attributes, maybe the filesystem doesn't supports ID-mapped mounts"); + else + bail("failed to change mount attributes"); + } + + write_log(DEBUG, "~> sending idmap source: %s with mapping from: %s", idmap_src, proc_user_path); + send_fd(sockfd, fd_tree); + + if (close(fd_tree) < 0) { + sane_kill(pid, SIGKILL); + bail("error closing fd_tree"); + } + + idmap_src += strlen(idmap_src) + 1; + } + + if (close(userns_fd) < 0) { + sane_kill(pid, SIGKILL); + bail("error closing userns fd"); + } +} + +void receive_idmapsources(int sockfd) +{ + receive_fd_sources(sockfd, "_LIBCONTAINER_IDMAP_FDS"); +} + void nsexec(void) { int pipenum; @@ -891,6 +982,17 @@ void nsexec(void) sane_kill(stage1_pid, SIGKILL); bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)"); } + break; + case 
SYNC_MOUNT_IDMAP_PLS: + write_log(DEBUG, "stage-1 requested to open idmap sources"); + send_idmapsources(syncfd, stage1_pid, config.idmapsources, + config.idmapsources_len); + s = SYNC_MOUNT_IDMAP_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + sane_kill(stage1_pid, SIGKILL); + bail("failed to sync with child: write(SYNC_MOUNT_IDMAP_ACK)"); + } + break; case SYNC_CHILD_FINISH: write_log(DEBUG, "stage-1 complete"); @@ -1062,6 +1164,21 @@ void nsexec(void) bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s); } + if (config.idmapsources) { + write_log(DEBUG, "request stage-0 to send idmap sources"); + s = SYNC_MOUNT_IDMAP_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: write(SYNC_MOUNT_IDMAP_PLS)"); + + /* Receive and install all idmap fds. */ + receive_idmapsources(syncfd); + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: read(SYNC_MOUNT_IDMAP_ACK)"); + if (s != SYNC_MOUNT_IDMAP_ACK) + bail("failed to sync with parent: SYNC_MOUNT_IDMAP_ACK: got %u", s); + } + /* * TODO: What about non-namespace clone flags that we're dropping here? * diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 9622798f0fd..edd3abd3c82 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -40,7 +40,8 @@ type mountConfig struct { // mountEntry contains mount data specific to a mount point. type mountEntry struct { *configs.Mount - srcFD string + srcFD string + idmapFD int } func (m *mountEntry) src() string { @@ -73,6 +74,10 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds mountFds) ( return fmt.Errorf("malformed mountFds slice. 
Expected size: %v, got: %v", len(config.Mounts), len(mountFds.sourceFds)) } + if mountFds.idmapFds != nil && len(mountFds.idmapFds) != len(config.Mounts) { + return fmt.Errorf("malformed idmapFds slice: expected size: %v, got: %v", len(config.Mounts), len(mountFds.idmapFds)) + } + mountConfig := &mountConfig{ root: config.Rootfs, label: config.MountLabel, @@ -81,13 +86,22 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds mountFds) ( cgroupns: config.Namespaces.Contains(configs.NEWCGROUP), } for i, m := range config.Mounts { - entry := mountEntry{Mount: m} + entry := mountEntry{Mount: m, idmapFD: -1} // Just before the loop we checked that if not empty, len(mountFds) == len(config.Mounts). // Therefore, we can access mountFds[i] without any concerns. if mountFds.sourceFds != nil && mountFds.sourceFds[i] != -1 { entry.srcFD = "/proc/self/fd/" + strconv.Itoa(mountFds.sourceFds[i]) } + // We validated before we can access idmapFds[i]. + if mountFds.idmapFds != nil && mountFds.idmapFds[i] != -1 { + entry.idmapFD = mountFds.idmapFds[i] + } + + if entry.idmapFD != -1 && entry.srcFD != "" { + return fmt.Errorf("malformed mountFds and idmapFds slice, entry: %v has fds in both slices", i) + } + if err := mountToRootfs(mountConfig, entry); err != nil { return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err) } @@ -466,8 +480,35 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { if err := prepareBindMount(m, rootfs); err != nil { return err } - if err := mountPropagate(m, rootfs, mountLabel); err != nil { - return err + + if m.IsBind() && m.IsIDMapped() { + if m.idmapFD == -1 { + return fmt.Errorf("error creating mount %+v: idmapFD is invalid, should point to a valid fd", m) + } + if err := unix.MoveMount(m.idmapFD, "", -1, dest, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil { + return fmt.Errorf("error on unix.MoveMount %+v: %w", m, err) + } + + // In nsexec.c, we did not set the propagation field of mount_attr 
struct. + // So, let's deal with these flags right now! + if err := utils.WithProcfd(rootfs, dest, func(dstFD string) error { + for _, pflag := range m.PropagationFlags { + // When using mount for setting propagation flags, the source, file + // system type and data arguments are ignored: + // https://man7.org/linux/man-pages/man2/mount.2.html + // We also ignore procfd because we want to act on dest. + if err := mountViaFDs("", "", dest, dstFD, "", uintptr(pflag), ""); err != nil { + return err + } + } + return nil + }); err != nil { + return fmt.Errorf("change mount propagation through procfd: %w", err) + } + } else { + if err := mountPropagate(m, rootfs, mountLabel); err != nil { + return err + } } // bind mount won't change mount options, we need remount to make mount options effective. // first check that we have non-default options required before attempting a remount diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index b22b37ecdd6..f3d04282362 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -86,15 +86,14 @@ func (l *linuxStandardInit) Init() error { // initialises the labeling system selinux.GetEnabled() - // We don't need the mountFds.SourceFds after prepareRootfs() nor if it fails. + // We don't need the mount fds or the idmap fds after prepareRootfs(), whether it succeeds or fails. err := prepareRootfs(l.pipe, l.config, l.mountFds) - for _, m := range l.mountFds.sourceFds { + for _, m := range append(l.mountFds.sourceFds, l.mountFds.idmapFds...) { if m == -1 { continue } - if err := unix.Close(m); err != nil { - return fmt.Errorf("unable to close mount sourceFds: %w", err) + return fmt.Errorf("unable to close mountFds fds: %w", err) } }