From a6c9285c4fae98453fe89799eab3a82bde341b73 Mon Sep 17 00:00:00 2001 From: YustasSwamp Date: Sun, 28 Jun 2015 22:57:35 -0700 Subject: [PATCH] Added contain sources --- .gitignore | 1 + Makefile | 17 +- tools/src/contain/COPYING | 19 +++ tools/src/contain/README | 304 ++++++++++++++++++++++++++++++++++++ tools/src/contain/TIPS | 93 +++++++++++ tools/src/contain/console.c | 154 ++++++++++++++++++ tools/src/contain/contain.c | 137 ++++++++++++++++ tools/src/contain/contain.h | 29 ++++ tools/src/contain/map.c | 222 ++++++++++++++++++++++++++ tools/src/contain/mount.c | 143 +++++++++++++++++ tools/src/contain/util.c | 71 +++++++++ 11 files changed, 1188 insertions(+), 2 deletions(-) create mode 100644 tools/src/contain/COPYING create mode 100644 tools/src/contain/README create mode 100644 tools/src/contain/TIPS create mode 100644 tools/src/contain/console.c create mode 100644 tools/src/contain/contain.c create mode 100644 tools/src/contain/contain.h create mode 100644 tools/src/contain/map.c create mode 100644 tools/src/contain/mount.c create mode 100644 tools/src/contain/util.c diff --git a/.gitignore b/.gitignore index 613be5e7d7..bc2470673b 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ packer-cache/ stage/ discus-cache/ output-*/ +tools/bin/ diff --git a/Makefile b/Makefile index bf0407e4ef..ce5d364b42 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,8 @@ Makefile: ; include $(MAKEROOT)/makedefs.mk +export PATH := $(SRCROOT)/tools/bin:$(PATH) + ifdef PHOTON_CACHE_PATH PHOTON_PACKAGES := packages-cached else @@ -29,6 +31,9 @@ else PHOTON_PUBLISH_RPMS := publish-rpms endif +TOOLS_BIN := $(SRCROOT)/tools/bin +CONTAIN := $(TOOLS_BIN)/contain + .PHONY : all iso clean photon-build-machine photon-vagrant-build photon-vagrant-local \ check check-bison check-g++ check-gawk check-createrepo check-vagrant check-packer check-packer-ovf-plugin check-sanity \ clean-install clean-chroot @@ -59,7 +64,7 @@ iso: check $(PHOTON_STAGE) $(PHOTON_PACKAGES) -f > \ 
$(PHOTON_LOGS_DIR)/installer.log 2>&1 -packages: check $(PHOTON_PUBLISH_RPMS) $(PHOTON_SOURCES) +packages: check $(PHOTON_PUBLISH_RPMS) $(PHOTON_SOURCES) $(CONTAIN) @echo "Building all RPMS..." @cd $(PHOTON_PKG_BUILDER_DIR) && \ $(PHOTON_PACKAGE_BUILDER) -o full \ @@ -119,6 +124,8 @@ clean: clean-install clean-chroot @$(RMDIR) $(PHOTON_STAGE) @echo "Deleting chroot path..." @$(RMDIR) $(PHOTON_CHROOT_PATH) + @echo "Deleting tools/bin..." + @$(RMDIR) $(TOOLS_BIN) clean-install: @echo "Cleaning installer working directory..." @@ -203,7 +210,7 @@ endif check-packer-ovf-plugin: @[[ -e ~/.packer.d/plugins/packer-post-processor-vagrant-vmware-ovf ]] || { echo "Packer OVF post processor not installed. Aborting" >&2; exit 1; } -%: check $(PHOTON_PUBLISH_RPMS) $(PHOTON_SOURCES) +%: check $(PHOTON_PUBLISH_RPMS) $(PHOTON_SOURCES) $(CONTAIN) $(eval PKG_NAME = $@) @echo "Building package $(PKG_NAME) ..." @cd $(PHOTON_PKG_BUILDER_DIR) && \ @@ -214,3 +221,9 @@ check-packer-ovf-plugin: -x $(PHOTON_SRCS_DIR) \ -p $(PHOTON_PUBLISH_RPMS_DIR) \ -l $(PHOTON_LOGS_DIR) + +$(TOOLS_BIN): + mkdir -p $(TOOLS_BIN) + +$(CONTAIN): $(TOOLS_BIN) + gcc -O2 -std=gnu99 -Wall -Wextra $(SRCROOT)/tools/src/contain/*.c -o $@ diff --git a/tools/src/contain/COPYING b/tools/src/contain/COPYING new file mode 100644 index 0000000000..435291bb20 --- /dev/null +++ b/tools/src/contain/COPYING @@ -0,0 +1,19 @@ +Copyright (C) 2013 Chris Webb + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. diff --git a/tools/src/contain/README b/tools/src/contain/README new file mode 100644 index 0000000000..5c4e2b13ca --- /dev/null +++ b/tools/src/contain/README @@ -0,0 +1,304 @@ +Containers +========== + +This package is a simple implementation of containers for Linux, making +secure containers as easy to create and use as a traditional chroot. It +comprises three utilities, contain, inject and pseudo, which use the kernel +support for user namespaces merged in Linux 3.8. + + +Demonstration +------------- + +With the utilities already installed, the demo begins in an unprivileged +user's shell: + + $ echo $$ $UID + 21260 1000 + +To create a simple test container, copy /bin and /lib* from the host into a +temporary directory with the default UID/GID mappings applied: + + $ cd $(mktemp -d) + $ tar -c -f - -C / bin lib lib32 lib64 | pseudo tar -x -f - + +It is very straightforward to launch a container with this newly-created +root filesystem: + + $ contain . 
/bin/bash + # + +The new shell has PID 1 within the container, and cannot see other processes +on the host: + + # echo $$ $UID + 1 0 + # ps ax + PID TTY STAT TIME COMMAND + 1 console Ss 0:00 /bin/bash + 2 console R+ 0:00 ps ax + +The container root user is able to manipulate ownerships and permissions +within its filesystem: + + # ls -l /dev/console + crw--w---- 1 0 5 136, 9 Jul 1 14:00 /dev/console + # chown 12:34 /dev/console + # chmod a+rw /dev/console + # ls -l /dev/console + crw-rw-rw- 1 12 34 136, 9 Jul 1 14:00 /dev/console + +and can also make other privileged changes such as setting the hostname: + + # echo -n "hostname $(hostname) -> " && hostname brian && hostname + hostname alice -> brian + +or configuring the network stack: + + # ip link show + 1: lo: mtu 65536 qdisc noop state DOWN mode DEFAULT + link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 + # ping -w 1 1.2.3.4 &>/dev/null && echo up || echo down + down + # ip addr add 1.2.3.4/32 dev lo && ip link set lo up + # ping -w 1 1.2.3.4 &>/dev/null && echo up || echo down + up + # ip link add type veth && ip link show + 1: lo: mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT + link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 + 2: veth0: mtu 1500 qdisc noop state DOWN mode DEFAULT qlen 1000 + link/ether 3a:0c:96:36:2d:ff brd ff:ff:ff:ff:ff:ff + 3: veth1: mtu 1500 qdisc noop state DOWN mode DEFAULT qlen 1000 + link/ether a2:86:1a:92:58:cb brd ff:ff:ff:ff:ff:ff + +In all cases, these changes affect the container but not the host as a +whole. Processes in the container live in different resource namespaces +isolated from the host, and the container root user is unable to do anything +that would require elevated capabilities or root privilege on the host +itself. + + +contain +------- + +The contain utility is invoked as + + contain [OPTIONS] DIR [CMD [ARG]...] 
+ +with options + + -c disable console emulation in the container + -g MAP set the container-to-host GID map + -i CMD run a helper child inside the new namespaces + -n share the host network unprivileged in the container + -o CMD run a helper child outside the new namespaces + -u MAP set the container-to-host UID map + +and creates a new container with DIR recursively bound as its root +filesystem, running CMD as PID 1 within that container. If unspecified, CMD +defaults to /bin/sh to start a shell, so to fully boot a distribution, +specify CMD as /bin/init or /sbin/init. + +The container init process is isolated in new user, mount, IPC, UTS, and PID +namespaces. A synthetic /dev with device nodes bound from the host /dev is +automatically mounted within the new mount namespace, together with standard +/dev/pts, /proc and /sys filesystems. + +Because it runs in its own user namespace, users and groups seen inside a +container are not the same as the underlying credentials visible for the +same processes and files on the host. Sensible default container-to-host UID +and GID mappings are provided and described below, but the -u and -g options +can be used to override the defaults. + +The container console is a host pseudo-terminal bound at /dev/console in the +new /dev filesystem: stdin and stdout are copied to/from this, and it serves +as stdin, stdout and stderr for the container init process. This console +emulation can be disabled using the -c option: if -c is used, init is run +directly with the stdin, stdout and stderr of the contain command. + +Containers are usually isolated in their own network namespace, with a +distinct set of network interfaces from the host. By specifying the -n +option, it is possible to safely share the host network stack instead. 
If +you do this, user networking within the container will work normally, but +the container has no privileges with respect to its network namespace so it +isn't possible to (re)configure interfaces or routes, and setuid utilities +like ping which use a raw socket will fail. + +Two different kinds of helper program can be used to help set up a +container. A program specified with -i is run inside the new namespaces with +the new root filesystem as its working directory, just before pivoting into +it. Typically this type of helper is used to bind-mount additional parts of +the host filesystem inside the container. + +A helper specified with -o is run outside the namespaces but as a direct +child of the supervisor process which is running within them. This type of +helper can be used to move host network interfaces (such as a macvtap +interface or one half of a veth pair) into the container's network +namespace. + +The environment of the container init process includes "container=contain" +so that distributions can identify when they are running under contain. + + +inject +------ + +The inject utility is invoked as + + inject PID [CMD [ARG]...] + +where PID is the process ID of a running container supervisor, and runs a +command or shell inside the existing container. The environment, stdin, +stdout and stderr of inject are all inherited by the command to be run. + +The container supervisor PID (i.e. that of contain itself) should be given +to inject, not the PID of the descendant init process. The inject utility +will only work if process specified has a child with "container=contain" +in its environment, which it assumes to be the container init. + +Linux allows an unprivileged user to join the user namespace of any +container started by his UID, so inject need not be installed setuid even if +contain and pseudo are setuid root. It will refuse to run if it detects +setuid/setgid operation. 
+ + +pseudo +------ + +The pseudo utility is invoked as + + pseudo [OPTIONS] [CMD [ARG]...] + +with options + + -g MAP set the user namespace GID map + -u MAP set the user namespace UID map + +and runs a command or shell as root in a new user namespace, by analogy with +sudo which runs a command as root in the host user namespace. + +Unlike contain, pseudo does not unshare other namespaces or attempt to +isolate the new process from the rest of the host. It has identical default +UID/GID mappings, -u and -g options, and support for /etc/subuid and +/etc/subgid when installed setuid root, but no other contain options are +supported. + +One use for pseudo is as a more capable replacement for fakeroot, useful for +testing, when building software packages or for constructing system images. +Unlike the traditional fakeroot approach based on LD_PRELOAD, static +binaries and chroot jails are both handled correctly. + +It is also invaluable for running host software to access the same +filesystem as a container, replicating the user and group file ownerships +that the container would see. For example, in the demo above, the system +image is untarred under pseudo so that files are written into the filesystem +with UIDs and GIDs mapped for the container rather than unmapped as on the +host. + + +User and group mappings +----------------------- + +By default, when run as root, contain and pseudo will map container UID/GID +0 onto the highest available host UID/GID (4294967294 unless nested), and +all other UIDs/GIDs are mapped onto themselves apart from the top container +UID and GID which must be left unmapped. + +The default mappings avoid host UID and GID 0 as the host root user is still +granted a variety of privileges even after dropping all capabilities in the +host user namespace. For example, /proc and /sys files typically have (host) +root:root ownership, and allowing the container unfiltered access to +things like /proc/sys is dangerous. 
+ +Run as an unprivileged user, container UID/GID 0 is mapped onto the +unprivileged user's UID/GID, then container UIDs/GIDs 1, 2, etc. are +successively mapped onto any ranges delegated to that user in /etc/subuid +and /etc/subgid. + +The -u and -g options can be used to specify custom mappings, in the format +START:LOWER:COUNT[,START:LOWER:COUNT]... where START is the first UID/GID in +a container range, LOWER is the first UID/GID in the corresponding range in +the host, and COUNT is the length of these ranges. + +For example, -u 0:1000:1,1:4000:2000 will map container UID 0 onto host UID +1000 and container UIDs 1...2000 onto host UIDs 4000...5999. + +It is not possible to map more than one container ID onto a given host ID, +nor to list the same container ID twice in a map specification. When invoked +by an unprivileged user, all host ranges are checked against /etc/subuid and +/etc/subgid. + +Unmapped users and groups are mapped by the kernel onto the overflow UID and +GID set in /proc/sys/kernel/overflowuid and /proc/sys/kernel/overflowgid. By +default the kernel sets both these values to 65534. + + +Unprivileged operation, /etc/subuid and /etc/subgid +--------------------------------------------------- + +When a non-root user runs contain or pseudo unprivileged, these tools can +only map container UID/GIDs onto the host UID/GID of that user. The +resulting container is not very useful as it has just a single user and +group available. (Typically only root is mapped in the container.) + +However, contain and pseudo can also be installed setuid root, and in this +case, unprivileged users can also map onto ranges of UIDs/GIDs that have +been delegated for their use in /etc/subuid and /etc/subgid. + +The format of these files is similar to /etc/passwd, /etc/group and +/etc/shadow. Each line specifies an additional range of UIDs/GIDs allocated +to a particular user, and there can be zero, one, or multiple lines for any +given user. 
There are three colon-delimited fields: the user's login name, +the first UID/GID in the range, and the number of UIDs/GIDs in the range. +For example, an /etc/subuid containing the lines + + chris:100000:10000 + chris:120000:10000 + +allocates UID ranges 100000-109999 and 120000-129999 to my user 'chris' in +addition to my normal login UID. + +The kernel user namespace author Eric Biederman has +proposed patches against the standard GNU/Linux Shadow package which add +support for creating and updating these files in this format; they are +likely to become a standard way to delegate sub-users and sub-groups. + +Linux 3.19 and later do not allow unprivileged processes to write a GID map +unless the setgroups() call has been permanently disabled by writing "deny" +to /proc/PID/setgroups. This is a fix for CVE-2014-8989 which applied to +strangely-configured systems where group membership implies more restricted +permissions rather than supplementary permissions. + +As a result, when run non-setuid by an unprivileged user, contain and pseudo +must disable setgroups() in the container. Conversely, when installed setuid +root, they will use their privilege to bypass this kernel restriction, +resulting in fully-functional containers which still support setgroups(). +However, this also means that they can be used to bypass restrictions +implemented by group membership. + + +Building and installing +----------------------- + +Unpack the source tar.gz file and change to the unpacked directory. + +Run 'make', then 'make install' as root to install both binaries setuid root +in /bin. Alternatively, you can set DESTDIR and/or BINDIR to install in a +different location, or strip and copy the compiled binaries into the correct +place manually. + +Note that setuid contain and pseudo effectively enable unprivileged users to +drop supplementary group memberships using setgroups(). 
Consequently, +they should NOT be installed setuid root on systems where group membership +implies more restricted permissions rather than supplementary permissions. + +These utilities were developed on GNU/Linux and are not portable to other +platforms as they rely on Linux-specific facilities such as namespaces. +Please report any problems or bugs to Chris Webb . + + +Copying +------- + +This software was written by Chris Webb and is +distributed as Free Software under the terms of the MIT license in COPYING. diff --git a/tools/src/contain/TIPS b/tools/src/contain/TIPS new file mode 100644 index 0000000000..da586d5741 --- /dev/null +++ b/tools/src/contain/TIPS @@ -0,0 +1,93 @@ +Shutting down or killing a container +------------------------------------ + +From the host, the inject utility can be used to run an appropriate command +within the container to start a graceful shut down. For example + + inject PID /bin/halt + +To immediately kill a container and all its processes, it is sufficient to +send the init process a SIGKILL from the host using + + pkill -KILL -P PID + +where PID is the process ID of a running container supervisor. It is very +important not to SIGKILL the container supervisor itself or the container +will be orphaned, continuing to run unsupervised as a child of the host +init. + + +Using cgroups to limit memory and CPU-share available to a container +-------------------------------------------------------------------- + +If cgroup support including memcg and memcg-swap is compiled into the kernel +and the cgroup filesystem is mounted with the cpu and memory controllers +enabled, it is straightforward to apply memory and CPU-share limits to a +container as it is started. 
For example, the shell script + + #!/bin/sh -e + mkdir /sys/fs/cgroup/mycontainer + echo $$ >/sys/fs/cgroup/mycontainer/tasks + echo 2G >/sys/fs/cgroup/mycontainer/memory.limit_in_bytes + echo 2G >/sys/fs/cgroup/mycontainer/memory.memsw.limit_in_bytes + echo 1000 >/sys/fs/cgroup/mycontainer/cpu.shares + exec contain [...] + +applies a limit of 2GB virtual memory and a CPU-share of 1000 before +starting the container. It might also be useful to apply a +memory.kmem.limit_in_bytes setting to prevent a container from using +excessive amounts of kernel memory. + +Note that to set the virtual memory limit in memory.memsw.limit_in_bytes, it +is first necessary to set a smaller or equal physical memory limit in +memory.limit_in_bytes. + +When a container lives inside a memory cgroup, memory.memsw.usage_in_bytes +gives a measure of the total virtual memory in use by the container, and +memory.usage_in_bytes measures its physical memory footprint. The accounting +policy is explained in linux/kernel/Documentation/cgroups/memory.txt. + + +Troubleshooting +--------------- + +The contain/pseudo error message 'Failed to unshare user namespace: Invalid +argument' typically means that your kernel is not compiled with support for +user namespaces, i.e. CONFIG_USER_NS is not set. The contain tool will also +die with a similar message referring to one of the other required namespaces +if support for that is not available in the kernel. + +To run these tools you need to be running Linux 3.8 or later with + + CONFIG_UTS_NS=y + CONFIG_IPC_NS=y + CONFIG_USER_NS=y + CONFIG_PID_NS=y + CONFIG_NET_NS=y + +set in the kernel build config. Note that before Linux 3.12, CONFIG_XFS_FS +conflicted with CONFIG_USER_NS, so these tools could not be used where XFS +support was compiled either into the kernel or as a module. + +The contain tool will fail to mount /dev/pts unless + + CONFIG_DEVPTS_MULTIPLE_INSTANCES=y + +is set in the kernel build config. 
Both container and host /dev/pts must be +mounted with -o newinstance, with /dev/ptmx symlinked to pts/ptmx. + +Linux 3.12 introduced tighter restrictions on mounting proc and sysfs, which +broke older versions of contain. To comply with these new rules, contain +now ensures that procfs and sysfs are mounted in the new mount namespace +before pivoting into the container and detaching the host root. + +A bug in Linux 3.12 will prevent contain from mounting /proc in a container +if binfmt_misc is mounted on /proc/sys/fs/binfmt_misc in the host +filesystem. This was fixed in Linux 3.13. + +Linux 3.19 introduced restrictions on writing a user namespace GID map as an +unprivileged user unless setgroups() has been permanently disabled, which +broke older versions of contain. Run non-setuid and unprivileged, contain +and pseudo must now disable setgroups() to create containers, but if they +are installed setuid, they will bypass this kernel restriction and leave +setgroups() enabled in the resulting containers. 
diff --git a/tools/src/contain/console.c b/tools/src/contain/console.c new file mode 100644 index 0000000000..79c552aa6f --- /dev/null +++ b/tools/src/contain/console.c @@ -0,0 +1,154 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "contain.h" + +static struct termios saved; + +int getconsole(void) { + int master; + + if ((master = posix_openpt(O_RDWR | O_NOCTTY)) < 0) + error(1, 0, "Failed to allocate a console pseudo-terminal"); + grantpt(master); + unlockpt(master); + return master; +} + +static void rawmode() { + struct termios termios; + + if (!isatty(STDIN_FILENO)) + return; + if (tcgetattr(STDIN_FILENO, &termios) < 0) + error(1, errno, "tcgetattr"); + cfmakeraw(&termios); + tcsetattr(STDIN_FILENO, TCSANOW, &termios); +} + +static void restoremode() { + if (isatty(STDIN_FILENO)) + tcsetattr(STDIN_FILENO, TCSANOW, &saved); +} + +static void savemode() { + if (isatty(STDIN_FILENO) && tcgetattr(STDIN_FILENO, &saved) < 0) + error(1, errno, "tcgetattr"); +} + +void setconsole(char *name) { + int console; + struct termios termios; + + setsid(); + + if ((console = open(name, O_RDWR)) < 0) + error(1, 0, "Failed to open console in container"); + ioctl(console, TIOCSCTTY, NULL); + + if (tcgetattr(console, &termios) < 0) + error(1, errno, "tcgetattr"); + termios.c_iflag |= IGNBRK | IUTF8; + tcsetattr(console, TCSANOW, &termios); + + dup2(console, STDIN_FILENO); + dup2(console, STDOUT_FILENO); + dup2(console, STDERR_FILENO); + if (console != STDIN_FILENO) + if (console != STDOUT_FILENO) + if (console != STDERR_FILENO) + close(console); +} + +int supervise(pid_t child, int console) { + char buffer[PIPE_BUF]; + int signals, status; + sigset_t mask; + ssize_t count, length, offset; + struct pollfd fds[3]; + + if (console < 0) { + if (waitpid(child, &status, 0) < 0) + error(1, errno, "waitpid"); + return WIFEXITED(status) ? 
WEXITSTATUS(status) : EXIT_FAILURE; + } + + sigemptyset(&mask); + sigaddset(&mask, SIGCHLD); + sigprocmask(SIG_BLOCK, &mask, NULL); + if ((signals = signalfd(-1, &mask, 0)) < 0) + error(1, errno, "signalfd"); + + if (waitpid(child, &status, WNOHANG) > 0) + if (WIFEXITED(status) || WIFSIGNALED(status)) + raise(SIGCHLD); + + savemode(); + atexit(restoremode); + rawmode(); + + fds[0].fd = console; + fds[0].events = POLLIN; + fds[1].fd = STDIN_FILENO; + fds[1].events = POLLIN; + fds[2].fd = signals; + fds[2].events = POLLIN; + + while (1) { + if (poll(fds, 3, -1) < 0) + if (errno != EAGAIN && errno != EINTR) + error(1, errno, "poll"); + + if (fds[0].revents & (POLLIN | POLLHUP)) { + while ((length = read(console, buffer, sizeof(buffer))) < 0) + if (errno != EAGAIN && errno != EINTR) + error(1, errno, "read"); + if (length > 0) { + for (offset = 0; length > 0; offset += count, length -= count) + while ((count = write(STDOUT_FILENO, buffer + offset, length)) < 0) + if (errno != EAGAIN && errno != EINTR) + error(1, errno, "write"); + } else { + fds[0].events = 0; + } + } + + if (fds[1].revents & (POLLIN | POLLHUP)) { + while ((length = read(STDIN_FILENO, buffer, sizeof(buffer))) < 0) + if (errno != EAGAIN && errno != EINTR) + error(1, errno, "read"); + if (length > 0) { + for (offset = 0; length > 0; offset += count, length -= count) + while ((count = write(console, buffer + offset, length)) < 0) + if (errno != EAGAIN && errno != EINTR) + error(1, errno, "write"); + } else { + fds[1].events = 0; + } + } + + if (fds[2].revents & POLLIN) { + while (read(signals, buffer, sizeof(buffer)) < 0) + if (errno != EAGAIN && errno != EINTR) + error(1, errno, "read"); + if (waitpid(child, &status, WNOHANG) > 0) + if (WIFEXITED(status) || WIFSIGNALED(status)) + break; + } + } + + close(signals); + return WIFEXITED(status) ? 
WEXITSTATUS(status) : EXIT_FAILURE; +} diff --git a/tools/src/contain/contain.c b/tools/src/contain/contain.c new file mode 100644 index 0000000000..18a3019061 --- /dev/null +++ b/tools/src/contain/contain.c @@ -0,0 +1,137 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "contain.h" + +void usage(char *progname) { + fprintf(stderr, "\ +Usage: %s [OPTIONS] DIR [CMD [ARG]...]\n\ +Options:\n\ + -b BND bind host path into container\n\ + -c disable console emulation in the container\n\ + -g MAP set the container-to-host GID map\n\ + -i CMD run a helper child inside the new namespaces\n\ + -n share the host network unprivileged in the container\n\ + -o CMD run a helper child outside the new namespaces\n\ + -u MAP set the container-to-host UID map\n\ +BND is specified as HOST_DIR:CONTAINER_DIR[,HOST_DIR2:CONTAINER_DIR2]...\n\ +GID and UID maps are specified as START:LOWER:COUNT[,START:LOWER:COUNT]...\n\ +", progname); + exit(EX_USAGE); +} + +int main(int argc, char **argv) { + char *gidmap = NULL, *inside = NULL, *outside = NULL, *uidmap = NULL; + char *bind = NULL; + int hostnet = 0, master, option, stdio = 0; + pid_t child, parent; + + while ((option = getopt(argc, argv, "+:b:cg:i:no:u:")) > 0) + switch (option) { + case 'b': + bind = optarg; + break; + case 'c': + stdio++; + break; + case 'g': + gidmap = optarg; + break; + case 'i': + inside = optarg; + break; + case 'n': + hostnet++; + break; + case 'o': + outside = optarg; + break; + case 'u': + uidmap = optarg; + break; + default: + usage(argv[0]); + } + + if (argc <= optind) + usage(argv[0]); + + parent = getpid(); + switch (child = fork()) { + case -1: + error(1, errno, "fork"); + case 0: + raise(SIGSTOP); +// if (geteuid() != 0) +// denysetgroups(parent); + writemap(parent, GID, gidmap); + writemap(parent, UID, uidmap); + + if (outside) { + if (setgid(getgid()) < 0 || setuid(getuid()) < 0) + error(1, 
0, "Failed to drop privileges"); + execlp(SHELL, SHELL, "-c", outside, NULL); + error(1, errno, "exec %s", outside); + } + + exit(EXIT_SUCCESS); + } + + if (setgid(getgid()) < 0 || setuid(getuid()) < 0) + error(1, 0, "Failed to drop privileges"); + + if (unshare(CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWUTS) < 0) + error(1, 0, "Failed to unshare namespaces"); + + if (!hostnet && unshare(CLONE_NEWNET) < 0) + error(1, 0, "Failed to unshare network namespace"); + + waitforstop(child); + kill(child, SIGCONT); + waitforexit(child); + + setgid(0); + setgroups(0, NULL); + setuid(0); + + master = stdio ? -1 : getconsole(); + createroot(argv[optind], master, inside, bind); + + unshare(CLONE_NEWPID); + switch (child = fork()) { + case -1: + error(1, errno, "fork"); + case 0: + mountproc(); + if (!hostnet) + mountsys(); + enterroot(); + + if (master >= 0) { + close(master); + setconsole("/dev/console"); + } + + clearenv(); + putenv("container=contain"); + + if (argv[optind + 1]) + execv(argv[optind + 1], argv + optind + 1); + else + execl(SHELL, SHELL, NULL); + error(1, errno, "exec"); + } + + return supervise(child, master); +} diff --git a/tools/src/contain/contain.h b/tools/src/contain/contain.h new file mode 100644 index 0000000000..53741248c4 --- /dev/null +++ b/tools/src/contain/contain.h @@ -0,0 +1,29 @@ +#ifndef CONTAIN_H +#define CONTAIN_H + +#define GID 0 +#define UID 1 +#define INVALID ((unsigned) -1) +#define SHELL "/bin/sh" + +#define getid(type) ((unsigned) ((type) == GID ? getgid() : getuid())) +#define idfile(type) ((type) == GID ? "gid_map" : "uid_map") +#define idname(type) ((type) == GID ? "GID" : "UID") +#define subpath(type) ((type) == GID ? 
"/etc/subgid" : "/etc/subuid")
+
+extern char *append(char **destination, const char *format, ...);
+extern void createroot(char *src, int console, char *helper, char *bind);
+extern void denysetgroups(pid_t pid);
+extern void enterroot(void);
+extern int getconsole(void);
+extern void mountproc(void);
+extern void mountsys(void);
+extern void setconsole(char *name);
+extern char *string(const char *format, ...);
+extern int supervise(pid_t child, int console);
+extern char *tmpdir(void);
+extern void waitforstop(pid_t child);
+extern void waitforexit(pid_t child);
+extern void writemap(pid_t pid, int type, char *map);
+
+#endif
diff --git a/tools/src/contain/map.c b/tools/src/contain/map.c
new file mode 100644
index 0000000000..4a0727c1b4
--- /dev/null
+++ b/tools/src/contain/map.c
@@ -0,0 +1,222 @@
+#define _GNU_SOURCE
+#include /* NOTE(review): the header names of these bare #include lines are missing in this copy of the patch; restore them before applying */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "contain.h"
+/* Write "deny" to /proc/PID/setgroups for the given pid; any failure is fatal (exit status 1). */
+void denysetgroups(pid_t pid) {
+  char *path, *text = "deny";
+  int fd;
+
+  path = string("/proc/%d/setgroups", pid);
+  if ((fd = open(path, O_WRONLY)) < 0)
+    error(1, 0, "Failed to disable setgroups() in container");
+  else if (write(fd, text, strlen(text)) != (ssize_t) strlen(text))
+    error(1, 0, "Failed to disable setgroups() in container");
+  close(fd);
+  free(path);
+}
+/* Read the idfile(type) map of pid (of this process when pid is -1) as a malloc'd "first:lower:count,..." string. */
+static char *getmap(pid_t pid, int type) {
+  char *line = NULL, *result = NULL, *path;
+  size_t size;
+  unsigned count, first, lower;
+  FILE *file;
+
+  if (pid == -1)
+    path = string("/proc/self/%s", idfile(type));
+  else
+    path = string("/proc/%d/%s", pid, idfile(type));
+  if (!(file = fopen(path, "r")))
+    error(1, 0, "Cannot read %s", path);
+
+  while (getline(&line, &size, file) >= 0) {
+    if (sscanf(line, " %u %u %u", &first, &lower, &count) != 3)
+      error(1, 0, "Invalid map data in %s", path);
+    append(&result, "%s%u:%u:%u", result ? "," : "", first, lower, count);
+  }
+
+  if (!result) /* an empty map file is rejected as well */
+    error(1, 0, "Invalid map data in %s", path);
+
+  fclose(file);
+  free(line);
+  free(path);
+  return result;
+}
+/* Parse the next "first:lower:count" item of map, skipping leading ',' or ';'; return the rest, or NULL at the end. */
+static char *mapitem(char *map, unsigned *first, unsigned *lower,
+    unsigned *count) {
+  ssize_t skip;
+
+  while (map && *map && strchr(",;", *map))
+    map++;
+  if (map == NULL || *map == '\0')
+    return NULL;
+  if (sscanf(map, "%u:%u:%u%zn", first, lower, count, &skip) < 3)
+    error(1, 0, "Invalid ID map '%s'", map);
+  return map + skip; /* skip holds the number of characters consumed by the three fields */
+}
+/* Parse the next "start:length" item of range, skipping leading ',' or ';'; return the rest, or NULL at the end. */
+static char *rangeitem(char *range, unsigned *start, unsigned *length) {
+  ssize_t skip;
+
+  while (range && *range && strchr(",;", *range))
+    range++;
+  if (range == NULL || *range == '\0')
+    return NULL;
+  if (sscanf(range, "%u:%u%zn", start, length, &skip) < 2)
+    error(1, 0, "Invalid ID range '%s'", range);
+  return range + skip;
+}
+/* Build a malloc'd "id:1[,start:length...]" list: the caller's own ID plus the ranges listed for the user in subpath(type). */
+static char *readranges(int type) {
+  char *line = NULL, *range, *user;
+  size_t end, size;
+  struct passwd *passwd;
+  unsigned length, start;
+  FILE *file;
+
+  range = string("%u:1", getid(type));
+  if (!(file = fopen(subpath(type), "r")))
+    return range; /* no readable sub-ID file: only the caller's own ID is available */
+
+  user = getenv("USER");
+  user = user ? user : getenv("LOGNAME");
+  user = user ? user : getlogin();
+  if (!user || !(passwd = getpwnam(user)) || passwd->pw_uid != getuid()) { /* the name must resolve to the real uid */
+    if (!(passwd = getpwuid(getuid())))
+      error(1, 0, "Failed to validate your username");
+    user = passwd->pw_name;
+  }
+  endpwent();
+
+  while (getline(&line, &size, file) >= 0) {
+    if (strncmp(line, user, strlen(user)))
+      continue;
+    if (sscanf(line + strlen(user), ":%u:%u%zn", &start, &length, &end) < 2)
+      continue;
+    if (strchr(":\n", line[strlen(user) + end])) /* end counts from the user name's end, so this is the first unparsed character (':', newline or the terminator) */
+      append(&range, ",%u:%u", start, length);
+  }
+
+  free(line);
+  fclose(file);
+  return range;
+}
+/* Default map for real uid 0: container ID 0 maps to the highest currently mapped ID, every other mapped ID maps to itself. */
+static char *rootdefault(int type) {
+  char *cursor, *map, *result;
+  unsigned count, first, last = INVALID, lower;
+
+  cursor = map = getmap(-1, type);
+  while ((cursor = mapitem(cursor, &first, &lower, &count))) /* first pass: find the highest mapped ID */
+    if (last == INVALID || last < first + count - 1)
+      last = first + count - 1;
+  result = string("0:%u:1", last);
+
+  cursor = map;
+  while ((cursor = mapitem(cursor, &first, &lower, &count))) {
+    if (first == 0) { /* ID 0 is already taken by the "0:last:1" entry */
+      if (count == 1 && first >= last)
+        error(1, 0, "No unprivileged %s available\n", idname(type));
+      first++, lower++, count--;
+    }
+
+    if (last <= first + count - 1 && count > 0) /* drop 'last' itself from the identity mapping */
+      count--;
+
+    if (count > 0)
+      append(&result, "%s%u:%u:%u", result ? "," : "", first, first, count);
+  }
+
+  free(map);
+  return result;
+}
+/* Default map for other callers: own ID only unless euid is 0, otherwise the delegated ranges clipped to the current map. */
+static char *userdefault(int type) {
+  char *cursor, *map, *range, *result = NULL;
+  unsigned count, first, index = 0, length, lower, start;
+
+  if (geteuid() != 0)
+    return string("0:%u:1", getid(type));
+
+  map = getmap(-1, type);
+  range = readranges(type);
+
+  while ((range = rangeitem(range, &start, &length))) {
+    cursor = map;
+    while ((cursor = mapitem(cursor, &first, &lower, &count))) {
+      if (start + length <= first || first + count <= start) /* no overlap with this map item */
+        continue;
+      if (first + count < start + length)
+        length = first + count - start; /* clip to the end of this map item (was start - first + count, only right when start == first) */
+      if (start < first) { /* clip to the start of this map item, leaving a gap in container IDs */
+        index += first - start;
+        length -= first - start;
+        start = first;
+      }
+      append(&result, "%s%u:%u:%u", result ? "," : "", index, start, length);
+      index += length;
+    }
+  }
+
+  free(map);
+  free(range); /* range is NULL here: the loop above ends only when rangeitem() returns NULL */
+  return result;
+}
+/* Exit with an error unless [first, first + count) is covered by range items; leftovers are checked against later items only. */
+static void validate(char *range, unsigned first, unsigned count) {
+  unsigned length, start;
+
+  while ((range = rangeitem(range, &start, &length)))
+    if (first < start + length && start < first + count) {
+      if (first < start)
+        validate(range, first, start - first);
+      if (first + count > start + length)
+        validate(range, start + length, first + count - start - length);
+      return;
+    }
+  error(1, 0, "Cannot map onto IDs that are not delegated to you");
+}
+/* Validate the lower:count side of every item of map against range. */
+static void verifymap(char *map, char *range) {
+  unsigned count, first, lower;
+
+  while ((map = mapitem(map, &first, &lower, &count)))
+    validate(range, lower, count);
+}
+/* Write map (a default one when map is NULL) to /proc/PID/idfile(type) as "first lower count" lines; failures are fatal. */
+void writemap(pid_t pid, int type, char *map) {
+  char *path, *range, *text = NULL;
+  int fd;
+  unsigned count, first, lower;
+
+  if (!map) {
+    map = (getuid() == 0 ? rootdefault : userdefault)(type);
+  } else if (getuid() != 0) { /* explicit maps from non-root callers must stay within their delegated ranges */
+    range = readranges(type);
+    verifymap(map, range);
+    free(range);
+  }
+
+  while ((map = mapitem(map, &first, &lower, &count)))
+    append(&text, "%u %u %u\n", first, lower, count);
+
+  path = string("/proc/%d/%s", pid, idfile(type));
+  if (!text || (fd = open(path, O_WRONLY)) < 0) /* text stays NULL when map has no items; strlen(NULL) below would be undefined */
+    error(1, 0, "Failed to set container %s map", idname(type));
+  else if (write(fd, text, strlen(text)) != (ssize_t) strlen(text))
+    error(1, 0, "Failed to set container %s map", idname(type));
+
+  close(fd);
+  free(path);
+  free(text);
+}
diff --git a/tools/src/contain/mount.c b/tools/src/contain/mount.c
new file mode 100644
index 0000000000..51612cb016
--- /dev/null
+++ b/tools/src/contain/mount.c
@@ -0,0 +1,143 @@
+#define _GNU_SOURCE
+#include /* NOTE(review): the header names of these bare #include lines are missing in this copy of the patch; restore them before applying */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "contain.h"
+/* Path of the temporary directory the new root is bound on; reset to NULL by enterroot(). */
+static char *root;
+/* Bind-mount src onto dst, first trying to create dst as an empty file; exit if the mount fails. */
+static void bindnode(char *src, char *dst) {
+  int fd;
+
+  if ((fd = open(dst, O_WRONLY | O_CREAT, 0600)) >= 0) /* best effort: a failed open is ignored */
+    close(fd);
+  if (mount(src, dst, NULL, MS_BIND, NULL) < 0)
+    error(1, 0, "Failed to bind '%s' into '%s'", src, dst);
+}
+/* atexit handler registered by createroot(): lazily unmount and remove the temporary root while root is set. */
+void cleanup(void) {
+  if (root) {
+    umount2(root, MNT_DETACH);
+    rmdir(root);
+  }
+}
+/* Split the next "src:dst" item off b in place (items separated by ',' or ';'); return the rest, or NULL at the end. */
+static char *binditem(char *b, char **s, char **d) {
+  char *orig = b;
+
+  while (b && *b && strchr(",;", *b))
+    b++;
+  if (b == NULL || *b == '\0')
+    return NULL;
+  *s = b;
+  while (*b && *b != ':')
+    b++;
+  if (*b != ':')
+    error(1, 0, "Invalid bind format '%s'", orig);
+  *b++ = '\0'; /* terminate the source part in place */
+  *d = b;
+  while (*b && !strchr(",;:", *b))
+    b++;
+  if (*b == ':') /* a second ':' inside one item is rejected */
+    error(1, 0, "Invalid bind format '%s'", orig);
+  if (*b)
+    *b++ = '\0';
+  return b;
+}
+/* Bind src on a fresh temporary directory, populate its dev/, apply the extra binds and run the optional helper with SHELL -c. */
+void createroot(char *src, int console, char *helper, char *bind) {
+  mode_t mask;
+  pid_t child;
+  char *bindsrc = NULL, *binddst = NULL;
+
+  root = tmpdir();
+  atexit(cleanup);
+
+  if (mount(src, root, NULL, MS_BIND | MS_REC, NULL) < 0)
+    error(1, 0, "Failed to bind new root filesystem");
+  else if (chdir(root) < 0)
+    error(1, 0, "Failed to enter new root filesystem");
+
+  mask = umask(0); /* create the directories below with exactly the modes given */
+  mkdir("dev" , 0755);
+  if (mount("tmpfs", "dev", "tmpfs", 0, "mode=0755") < 0)
+    error(1, 0, "Failed to mount /dev tmpfs in new root filesystem");
+
+  mkdir("dev/pts", 0755);
+  if (mount("devpts", "dev/pts", "devpts", 0, "newinstance,ptmxmode=666") < 0)
+    error(1, 0, "Failed to mount /dev/pts in new root filesystem");
+
+  mkdir("dev/tmp", 0755); /* used by enterroot() as the put_old directory for pivot_root */
+  umask(mask);
+
+  if (console >= 0)
+    bindnode(ptsname(console), "dev/console");
+  bindnode("/dev/full", "dev/full");
+  bindnode("/dev/null", "dev/null");
+  bindnode("/dev/random", "dev/random");
+  bindnode("/dev/tty", "dev/tty");
+  bindnode("/dev/urandom", "dev/urandom");
+  bindnode("/dev/zero", "dev/zero");
+  symlink("pts/ptmx", "dev/ptmx");
+
+  while ((bind = binditem(bind, &bindsrc, &binddst))) /* binditem() modifies the bind string in place */
+    bindnode(bindsrc, binddst);
+
+  if (helper)
+    switch (child = fork()) {
+      case -1:
+        error(1, errno, "fork"); /* error() with a non-zero status does not return, so there is no fall-through */
+      case 0:
+        execlp(SHELL, SHELL, "-c", helper, NULL);
+        error(1, errno, "exec %s", helper); /* reached only if execlp() failed */
+      default:
+        waitforexit(child);
+    }
+}
+/* pivot_root into the current directory, then detach and remove the old root parked on /dev/tmp. */
+void enterroot(void) {
+  if (syscall(__NR_pivot_root, ".", "dev/tmp") < 0)
+    error(1, 0, "Failed to pivot into new root filesystem");
+
+  if (chdir("/dev/tmp") >= 0) {
+    while (*root == '/') /* make the path relative so it resolves inside the old root */
+      root++;
+    rmdir(root);
+  }
+
+  root = NULL; /* stops cleanup() from touching the temporary root again */
+
+  if (chdir("/") < 0 || umount2("/dev/tmp", MNT_DETACH) < 0)
+    error(1, 0, "Failed to detach old root filesystem");
+  else
+    rmdir("/dev/tmp");
+}
+/* Create ./proc with mode 0755 and mount a proc filesystem on it; exit if the mount fails. */
+void mountproc(void) {
+  mode_t mask;
+
+  mask = umask(0);
+  mkdir("proc" , 0755);
+  umask(mask);
+
+  if (mount("proc", "proc", "proc", 0, NULL) < 0)
+    error(1, 0, "Failed to mount /proc in new root filesystem");
+}
+/* Create ./sys with mode 0755 and mount a sysfs filesystem on it; exit if the mount fails. */
+void mountsys(void) {
+  mode_t mask;
+
+  mask = umask(0);
+  mkdir("sys" , 0755);
+  umask(mask);
+
+  if (mount("sysfs", "sys", "sysfs", 0, NULL) < 0)
+    error(1, 0, "Failed to mount /sys in new root filesystem");
+}
diff --git a/tools/src/contain/util.c b/tools/src/contain/util.c
new file mode 100644
index 0000000000..85f0b1e6ea
--- /dev/null
+++ b/tools/src/contain/util.c
@@ -0,0 +1,71 @@
+#define _GNU_SOURCE
+#include /* NOTE(review): the header names of these bare #include lines are missing in this copy of the patch; restore them before applying */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "contain.h"
+/* Append printf-style text to *destination (taking it over as-is when *destination is NULL); return the new string. */
+char *append(char **destination, const char *format, ...) {
+  char *extra, *result;
+  va_list args;
+
+  va_start(args, format);
+  if (vasprintf(&extra, format, args) < 0)
+    error(1, errno, "asprintf");
+  va_end(args);
+
+  if (*destination == NULL) {
+    *destination = extra;
+    return extra;
+  }
+
+  if (asprintf(&result, "%s%s", *destination, extra) < 0)
+    error(1, errno, "asprintf");
+  free(*destination); /* the old buffer is replaced, so callers must not keep pointers into it */
+  free(extra);
+  *destination = result;
+  return result;
+}
+/* Return a newly allocated printf-style formatted string; exit if formatting or allocation fails. */
+char *string(const char *format, ...) {
+  char *result;
+  va_list args;
+
+  va_start(args, format);
+  if (vasprintf(&result, format, args) < 0)
+    error(1, errno, "asprintf");
+  va_end(args);
+  return result;
+}
+/* Create a unique directory from the template "/tmp/XXXXXX" and return its malloc'd path; exit on failure. */
+char *tmpdir(void) {
+  char *dir;
+
+  if (!(dir = strdup("/tmp/XXXXXX")))
+    error(1, errno, "strdup");
+  else if (!mkdtemp(dir))
+    error(1, 0, "Failed to create temporary directory");
+  return dir;
+}
+/* Wait for child to terminate; return only if it exited with EXIT_SUCCESS, otherwise exit with a failure status. */
+void waitforexit(pid_t child) {
+  int status;
+
+  if (waitpid(child, &status, 0) < 0)
+    error(1, errno, "waitpid");
+  else if (!WIFEXITED(status) || WEXITSTATUS(status) != EXIT_SUCCESS)
+    exit(WIFEXITED(status) ? WEXITSTATUS(status) : EXIT_FAILURE); /* WEXITSTATUS is only meaningful after a normal exit; a signal death is a failure */
+}
+/* Wait until child stops; if it terminated instead, exit with its status (EXIT_FAILURE when it was killed by a signal). */
+void waitforstop(pid_t child) {
+  int status;
+
+  if (waitpid(child, &status, WUNTRACED) < 0)
+    error(1, errno, "waitpid");
+  if (!WIFSTOPPED(status))
+    exit(WIFEXITED(status) ? WEXITSTATUS(status) : EXIT_FAILURE); /* do not report a signal death as exit status 0 */
+}