Skip to content

Commit

Permalink
Support Rootless Docker
Browse files Browse the repository at this point in the history
Requirements:
- Install rootless Docker 20.10 or later, see https://rootlesscontaine.rs/getting-started/docker/
- Enable cgroup v2 delegation, see https://rootlesscontaine.rs/getting-started/common/cgroup2/

Usage: `minikube start --driver=docker --container-runtime=containerd`.
The `--container-runtime` flag needs to be set to "containerd".
CRI-O can be also supported later.

Closes issue 10836 ("add support for rootless Docker").

Support for rootless Podman (issue 8719) is not covered in this commit.

---

Code reading guide:
- `deploy/kicbase/Dockerfile`: updated to install fuse-overlayfs and containerd-fuse-overlayfs, which is used
  instead of `overlayfs` snapshotter

- `deploy/kicbase/entrypoint`: updated to verify cgroup v2 delegation.
  Mostly from https://github.com/kubernetes-sigs/kind/blob/8a83ee46b28a80ccd47a85e24294b3e149361947/images/base/files/usr/local/bin/entrypoint

- `cmd/minikube/cmd/start_flags.go`: updated to set `KubeletInUserNamespace` feature gate when rootless

- `pkg/drivers/kic/oci`: updated to use port forwarding, because rootless container IPs are not reachable from the host

- `pkg/minikube/cruntime`: updated to generate `/etc/containerd/config.toml` with rootless support.

Signed-off-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
  • Loading branch information
AkihiroSuda committed Aug 27, 2021
1 parent 0e9c49f commit 3232254
Show file tree
Hide file tree
Showing 17 changed files with 204 additions and 27 deletions.
28 changes: 28 additions & 0 deletions cmd/minikube/cmd/start_flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -492,9 +492,37 @@ func generateNewConfigFromFlags(cmd *cobra.Command, k8sVersion string, drvName s
}
}

if driver.IsKIC(drvName) {
si, err := oci.CachedDaemonInfo(drvName)
if err != nil {
exit.Message(reason.Usage, "Ensure your {{.driver_name}} is running and is healthy.", out.V{"driver_name": driver.FullName(drvName)})
}
if si.Rootless {
if cc.KubernetesConfig.ContainerRuntime != "containerd" {
exit.Message(reason.Usage, "Container runtime must be set to \"containerd\" for rootless")
// TODO: support cri-o (https://kubernetes.io/docs/tasks/administer-cluster/kubelet-in-userns/#configuring-cri)
}
// KubeletInUserNamespace feature gate is essential for rootless driver.
// See https://kubernetes.io/docs/tasks/administer-cluster/kubelet-in-userns/
cc.KubernetesConfig.FeatureGates = addFeatureGate(cc.KubernetesConfig.FeatureGates, "KubeletInUserNamespace=true")
}
}

return cc
}

func addFeatureGate(featureGates, s string) string {
split := strings.Split(featureGates, ",")
m := make(map[string]struct{}, len(split))
for _, v := range split {
m[v] = struct{}{}
}
if _, ok := m[s]; !ok {
split = append(split, s)
}
return strings.Join(split, ",")
}

func checkNumaCount(k8sVersion string) {
if viper.GetInt(kvmNUMACount) < 1 || viper.GetInt(kvmNUMACount) > 8 {
exit.Message(reason.Usage, "--kvm-numa-count range is 1-8")
Expand Down
19 changes: 18 additions & 1 deletion deploy/kicbase/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ RUN cd ./cmd/auto-pause/ && go build
FROM ubuntu:focal-20210401

ARG BUILDKIT_VERSION="v0.9.0"
ARG FUSE_OVERLAYFS_VERSION="v1.7.1"
ARG CONTAINERD_FUSE_OVERLAYFS_VERSION="1.0.3"

# copy in static files (configs, scripts)
COPY deploy/kicbase/10-network-security.conf /etc/sysctl.d/10-network-security.conf
Expand Down Expand Up @@ -113,14 +115,29 @@ RUN clean-install \
openssh-server \
dnsutils \
# libglib2.0-0 is required for conmon, which is required for podman
libglib2.0-0
libglib2.0-0 \
# fuse3 is required for fuse-overlayfs
fuse3

# install docker
RUN sh -c "echo 'deb https://download.docker.com/linux/ubuntu focal stable' > /etc/apt/sources.list.d/docker.list" && \
curl -L https://download.docker.com/linux/ubuntu/gpg -o docker.key && \
apt-key add - < docker.key && \
clean-install docker-ce docker-ce-cli containerd.io

# install fuse-overlayfs (used by rootless; apt-get version is old)
RUN curl -sSL --retry 5 --output /usr/local/bin/fuse-overlayfs https://github.com/containers/fuse-overlayfs/releases/download/${FUSE_OVERLAYFS_VERSION}/fuse-overlayfs-$(uname -m) \
&& chmod +x /usr/local/bin/fuse-overlayfs

# install containerd-fuse-overlayfs (used by rootless)
RUN export ARCH=$(dpkg --print-architecture | sed 's/ppc64el/ppc64le/' | sed 's/armhf/arm-v7/') \
&& echo "Installing containerd-fuse-overlayfs..." \
&& export CONTAINERD_FUSE_OVERLAYFS_BASE_URL="https://github.com/containerd/fuse-overlayfs-snapshotter/releases/download/v${CONTAINERD_FUSE_OVERLAYFS_VERSION}" \
&& curl -sSL --retry 5 --output /tmp/containerd-fuse-overlayfs.tgz "${CONTAINERD_FUSE_OVERLAYFS_BASE_URL}/containerd-fuse-overlayfs-${CONTAINERD_FUSE_OVERLAYFS_VERSION}-linux-${ARCH}.tar.gz" \
&& tar -C /usr/local/bin -xzvf /tmp/containerd-fuse-overlayfs.tgz \
&& rm -rf /tmp/containerd-fuse-overlayfs.tgz
COPY deploy/kicbase/containerd-fuse-overlayfs.service /etc/systemd/system/containerd-fuse-overlayfs.service

# install buildkit
RUN export ARCH=$(dpkg --print-architecture | sed 's/ppc64el/ppc64le/' | sed 's/armhf/arm-v7/') \
&& echo "Installing buildkit ..." \
Expand Down
13 changes: 13 additions & 0 deletions deploy/kicbase/containerd-fuse-overlayfs.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# From https://github.com/kubernetes-sigs/kind/blob/0d3780371091b2dc9ff6eea1b6054f14ff5d970a/images/base/files/etc/systemd/system/containerd-fuse-overlayfs.service
[Unit]
Description=containerd fuse-overlayfs snapshotter
PartOf=containerd.service

[Service]
ExecStart=/usr/local/bin/containerd-fuse-overlayfs-grpc /run/containerd-fuse-overlayfs.sock /var/lib/containerd-fuse-overlayfs
Type=notify
Restart=always
RestartSec=1

[Install]
WantedBy=multi-user.target
52 changes: 46 additions & 6 deletions deploy/kicbase/entrypoint
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,39 @@ set -o nounset
set -o pipefail
set -x

# If /proc/self/uid_map 4294967295 mappings, we are in the initial user namespace, i.e. the host.
# Otherwise we are in a non-initial user namespace.
# https://github.com/opencontainers/runc/blob/v1.0.0-rc92/libcontainer/system/linux.go#L109-L118
userns=""
if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then
userns="1"
echo 'INFO: running in a user namespace (experimental)'
fi

validate_userns() {
if [[ -z "${userns}" ]]; then
return
fi

local nofile_hard
nofile_hard="$(ulimit -Hn)"
local nofile_hard_expected="64000"
if [[ "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then
echo "WARN: UserNS: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2
fi

if [[ ! -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
echo "ERROR: UserNS: cgroup v2 needs to be enabled, see https://rootlesscontaine.rs/getting-started/common/cgroup2/" >&2
exit 1
fi
for f in cpu memory pids; do
if ! grep -qw $f /sys/fs/cgroup/cgroup.controllers; then
echo "ERROR: UserNS: $f controller needs to be delegated, see https://rootlesscontaine.rs/getting-started/common/cgroup2/" >&2
exit 1
fi
done
}

configure_containerd() {
# we need to switch to the 'native' snapshotter on zfs
if [[ "$(stat -f -c %T /kind)" == 'zfs' ]]; then
Expand Down Expand Up @@ -73,12 +106,16 @@ fix_mount() {
sync
fi

echo 'INFO: remounting /sys read-only'
# systemd-in-a-container should have read only /sys
# https://systemd.io/CONTAINER_INTERFACE/
# however, we need other things from `docker run --privileged` ...
# and this flag also happens to make /sys rw, amongst other things
mount -o remount,ro /sys
if [[ -z "${userns}" ]]; then
echo 'INFO: remounting /sys read-only'
# systemd-in-a-container should have read only /sys
# https://systemd.io/CONTAINER_INTERFACE/
# however, we need other things from `docker run --privileged` ...
# and this flag also happens to make /sys rw, amongst other things
#
# This step is skipped when running inside UserNS, because it fails with EACCES.
mount -o remount,ro /sys
fi

echo 'INFO: making mounts shared' >&2
# for mount propagation
Expand Down Expand Up @@ -334,6 +371,9 @@ enable_network_magic(){
fi
}

# validate state
validate_userns

# run pre-init fixups
# NOTE: it's important that we do configure* first in this order to avoid races
configure_containerd
Expand Down
2 changes: 1 addition & 1 deletion hack/preload-images/generate.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ func generateTarball(kubernetesVersion, containerRuntime, tarballFilename string
if err != nil {
return errors.Wrap(err, "failed create new runtime")
}
if err := cr.Enable(true, false); err != nil {
if err := cr.Enable(true, false, false); err != nil {
return errors.Wrap(err, "enable container runtime")
}

Expand Down
27 changes: 27 additions & 0 deletions pkg/drivers/kic/oci/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,33 @@ import (
// RoutableHostIPFromInside returns the ip/dns of the host that container lives on
// is routable from inside the container
func RoutableHostIPFromInside(ociBin string, clusterName string, containerName string) (net.IP, error) {
si, err := CachedDaemonInfo(ociBin)
if err != nil {
return nil, err
}
if si.Rootless {
if IsExternalDaemonHost(ociBin) {
return nil, fmt.Errorf("function RoutableHostIPFromInside is not implemented for external rootless daemons")
// TODO: parse DaemonHost()
}
addrs, err := net.InterfaceAddrs()
if err != nil {
return nil, err
}
for _, addr := range addrs {
var ip net.IP
switch v := addr.(type) {
case *net.IPAddr:
ip = v.IP
case *net.IPNet:
ip = v.IP
}
if ip != nil && !ip.IsLoopback() {
return ip, nil
}
}
return nil, fmt.Errorf("could not detect host IP, tried %v", addrs)
}
if ociBin == Docker {
if runtime.GOOS == "linux" {
info, err := containerNetworkInspect(ociBin, clusterName)
Expand Down
3 changes: 3 additions & 0 deletions pkg/drivers/kic/oci/oci.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ func CreateContainerNode(p CreateParams) error {
// including some ones docker would otherwise do by default.
// for now this is what we want. in the future we may revisit this.
"--privileged",
// enable /dev/fuse explicitly for fuse-overlayfs
// (Rootless Docker does not automatically mount /dev/fuse with --privileged)
"--device", "/dev/fuse",
"--security-opt", "seccomp=unconfined", // ignore seccomp
"--tmpfs", "/tmp", // various things depend on working /tmp
"--tmpfs", "/run", // systemd wants a writable /run
Expand Down
29 changes: 25 additions & 4 deletions pkg/minikube/cruntime/containerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ oom_score = 0
[cgroup]
path = ""
[proxy_plugins]
# fuse-overlayfs is used for rootless
[proxy_plugins."fuse-overlayfs"]
type = "snapshot"
address = "/run/containerd-fuse-overlayfs.sock"
[plugins]
[plugins.cgroups]
no_prometheus = false
Expand All @@ -80,6 +86,7 @@ oom_score = 0
stats_collect_period = 10
enable_tls_streaming = false
max_container_log_line_size = 16384
restrict_oom_score_adj = {{ .RestrictOOMScoreAdj }}
[plugins."io.containerd.grpc.v1.cri"]
[plugins."io.containerd.grpc.v1.cri".containerd]
Expand All @@ -90,7 +97,7 @@ oom_score = 0
SystemdCgroup = {{ .SystemdCgroup }}
[plugins.cri.containerd]
snapshotter = "overlayfs"
snapshotter = "{{ .Snapshotter }}"
[plugins.cri.containerd.default_runtime]
runtime_type = "io.containerd.runc.v2"
[plugins.cri.containerd.untrusted_workload_runtime]
Expand Down Expand Up @@ -193,23 +200,31 @@ func (r *Containerd) Available() error {
}

// generateContainerdConfig sets up /etc/containerd/config.toml
func generateContainerdConfig(cr CommandRunner, imageRepository string, kv semver.Version, forceSystemd bool, insecureRegistry []string) error {
func generateContainerdConfig(cr CommandRunner, imageRepository string, kv semver.Version, forceSystemd bool, insecureRegistry []string, inUserNamespace bool) error {
cPath := containerdConfigFile
t, err := template.New("containerd.config.toml").Parse(containerdConfigTemplate)
if err != nil {
return err
}
pauseImage := images.Pause(kv, imageRepository)
snapshotter := "overlayfs"
if inUserNamespace {
snapshotter = "fuse-overlayfs"
}
opts := struct {
PodInfraContainerImage string
SystemdCgroup bool
InsecureRegistry []string
CNIConfDir string
RestrictOOMScoreAdj bool
Snapshotter string
}{
PodInfraContainerImage: pauseImage,
SystemdCgroup: forceSystemd,
InsecureRegistry: insecureRegistry,
CNIConfDir: cni.ConfDir,
RestrictOOMScoreAdj: inUserNamespace,
Snapshotter: snapshotter,
}
var b bytes.Buffer
if err := t.Execute(&b, opts); err != nil {
Expand All @@ -223,7 +238,7 @@ func generateContainerdConfig(cr CommandRunner, imageRepository string, kv semve
}

// Enable idempotently enables containerd on a host
func (r *Containerd) Enable(disOthers, forceSystemd bool) error {
func (r *Containerd) Enable(disOthers, forceSystemd, inUserNamespace bool) error {
if disOthers {
if err := disableOthers(r, r.Runner); err != nil {
klog.Warningf("disableOthers: %v", err)
Expand All @@ -232,13 +247,19 @@ func (r *Containerd) Enable(disOthers, forceSystemd bool) error {
if err := populateCRIConfig(r.Runner, r.SocketPath()); err != nil {
return err
}
if err := generateContainerdConfig(r.Runner, r.ImageRepository, r.KubernetesVersion, forceSystemd, r.InsecureRegistry); err != nil {
if err := generateContainerdConfig(r.Runner, r.ImageRepository, r.KubernetesVersion, forceSystemd, r.InsecureRegistry, inUserNamespace); err != nil {
return err
}
if err := enableIPForwarding(r.Runner); err != nil {
return err
}

if inUserNamespace {
if err := r.Init.EnableNow("containerd-fuse-overlayfs"); err != nil {
return err
}
}

// Otherwise, containerd will fail API requests with 'Unimplemented'
return r.Init.Restart("containerd")
}
Expand Down
5 changes: 4 additions & 1 deletion pkg/minikube/cruntime/crio.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,10 @@ func enableIPForwarding(cr CommandRunner) error {
}

// Enable idempotently enables CRIO on a host
func (r *CRIO) Enable(disOthers, _ bool) error {
func (r *CRIO) Enable(disOthers, _, inUserNamespace bool) error {
if inUserNamespace {
return errors.New("inUserNamespace must not be true for cri-o (yet)")
}
if disOthers {
if err := disableOthers(r, r.Runner); err != nil {
klog.Warningf("disableOthers: %v", err)
Expand Down
2 changes: 1 addition & 1 deletion pkg/minikube/cruntime/cruntime.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ type Manager interface {
// Version retrieves the current version of this runtime
Version() (string, error)
// Enable idempotently enables this runtime on a host
Enable(bool, bool) error
Enable(bool, bool, bool) error
// Disable idempotently disables this runtime on a host
Disable() error
// Active returns whether or not a runtime is active on a host
Expand Down
2 changes: 1 addition & 1 deletion pkg/minikube/cruntime/cruntime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,7 @@ func TestEnable(t *testing.T) {
if err != nil {
t.Fatalf("New(%s): %v", tc.runtime, err)
}
err = cr.Enable(true, false)
err = cr.Enable(true, false, false)
if err != nil {
t.Errorf("%s disable unexpected error: %v", tc.runtime, err)
}
Expand Down
5 changes: 4 additions & 1 deletion pkg/minikube/cruntime/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,10 @@ func (r *Docker) Active() bool {
}

// Enable idempotently enables Docker on a host
func (r *Docker) Enable(disOthers, forceSystemd bool) error {
func (r *Docker) Enable(disOthers, forceSystemd, inUserNamespace bool) error {
if inUserNamespace {
return errors.New("inUserNamespace must not be true for docker")
}
containerdWasActive := r.Init.Active("containerd")

if disOthers {
Expand Down
10 changes: 9 additions & 1 deletion pkg/minikube/driver/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,15 @@ func NeedsPortForward(name string) bool {
return true
}
// Docker for Desktop
return runtime.GOOS == "darwin" || runtime.GOOS == "windows" || detect.IsMicrosoftWSL()
if runtime.GOOS == "darwin" || runtime.GOOS == "windows" || detect.IsMicrosoftWSL() {
return true
}

si, err := oci.CachedDaemonInfo(name)
if err != nil {
panic(err)
}
return si.Rootless
}

// HasResourceLimits returns true if driver can set resource limits such as memory size or CPU count.
Expand Down
3 changes: 2 additions & 1 deletion pkg/minikube/node/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,8 @@ func configureRuntimes(runner cruntime.CommandRunner, cc config.ClusterConfig, k
}
}

err = cr.Enable(disableOthers, forceSystemd())
inUserNamespace := strings.Contains(cc.KubernetesConfig.FeatureGates, "KubeletInUserNamespace=true")
err = cr.Enable(disableOthers, forceSystemd(), inUserNamespace)
if err != nil {
exit.Error(reason.RuntimeEnable, "Failed to enable container runtime", err)
}
Expand Down
9 changes: 1 addition & 8 deletions pkg/minikube/registry/drvs/docker/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,14 +146,7 @@ func status() (retState registry.State) {
return suggestFix("info", -1, serr, fmt.Errorf("docker info error: %s", serr))
}

if si.Rootless {
return registry.State{
Reason: "PROVIDER_DOCKER_ROOTLESS",
Error: errors.New("rootless Docker not supported yet"),
Installed: true,
Healthy: false,
Doc: "https://github.com/kubernetes/minikube/issues/10836"}
}
// TODO: validate cgroup v2 delegation when si.Rootless is true

return checkNeedsImprovement()
}
Expand Down
Loading

0 comments on commit 3232254

Please sign in to comment.