Skip to content

Commit

Permalink
Support Rootless Docker, with vanilla Kubernetes
Browse files Browse the repository at this point in the history
Signed-off-by: Akihiro Suda <[email protected]>
  • Loading branch information
AkihiroSuda committed Nov 18, 2020
1 parent abd0bfd commit 3304c25
Show file tree
Hide file tree
Showing 5 changed files with 193 additions and 6 deletions.
1 change: 1 addition & 0 deletions images/base/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ RUN echo "Ensuring scripts are executable ..." \
libseccomp2 pigz \
bash ca-certificates curl rsync \
nfs-common \
jq \
&& find /lib/systemd/system/sysinit.target.wants/ -name "systemd-tmpfiles-setup.service" -delete \
&& rm -f /lib/systemd/system/multi-user.target.wants/* \
&& rm -f /etc/systemd/system/*.wants/* \
Expand Down
4 changes: 4 additions & 0 deletions images/base/files/etc/containerd/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ version = 2
default_runtime_name = "runc"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
BinaryName = "runc"

# Setup a runtime with the magic name ("test-handler") used for Kubernetes
# runtime class tests ...
Expand All @@ -20,3 +22,5 @@ version = 2
tolerate_missing_hugepages_controller = true
# explicitly use default snapshotter so we can sed it in entrypoint
snapshotter = "overlayfs"
# restrict_oom_score_adj needs to be true when running inside UserNS (rootless)
restrict_oom_score_adj = false
99 changes: 93 additions & 6 deletions images/base/files/usr/local/bin/entrypoint
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,91 @@ set -o errexit
set -o nounset
set -o pipefail

# Detect whether the entrypoint is running inside a user namespace.
# Any /proc/self/uid_map line other than the full identity mapping
# "0 0 4294967295" is treated as "inside a user namespace".
# userns is "1" in that case and empty otherwise; later steps test it with -z/-n.
userns=""
# grep -E replaces the obsolescent egrep; -v selects non-matching lines, -q only sets the status
if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then
  userns="1"
  echo 'INFO: running in user namespace (experimental)'
fi

#######################################
# Checks host prerequisites that are required when running inside a user
# namespace. Does nothing outside a user namespace.
# Globals:   userns (read) - non-empty when running inside a user namespace
# Outputs:   an ERROR line on stderr for the first unmet requirement
# Returns:   0 when not in a user namespace or when every check passes;
#            exits the script with status 1 on the first unmet requirement
#######################################
validate_userns() {
  if [[ -z "${userns}" ]]; then
    return 0
  fi

  local -r nofile_hard_expected="64000"
  # Declared separately from the assignment so a failing command is not
  # hidden behind the (always successful) exit status of `local`.
  local nofile_hard
  nofile_hard="$(ulimit -Hn)"
  # bash reports an unbounded limit as the word "unlimited", which is not a
  # number; it trivially satisfies the minimum, so skip the numeric comparison.
  if [[ "${nofile_hard}" != "unlimited" && "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then
    # This ERROR can be demoted to WARNING when k/k PR gets merged: https://github.com/kubernetes/kubernetes/pull/92863
    echo "ERROR: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2
    exit 1
  fi

  local dmesg_restrict
  dmesg_restrict="$(cat /proc/sys/kernel/dmesg_restrict)"
  if [[ "${dmesg_restrict}" != "0" ]]; then
    echo "ERROR: expected kernel.dmesg_restrict to be 0, got ${dmesg_restrict}" >&2
    exit 1
  fi
}

#######################################
# Shadows a file with a writable copy: writes the given content to a file
# under /run/fake and bind-mounts that file over the original path.
# Arguments: $1 - path of the file to shadow
#            $2 - content to store in the shadow file
# Outputs:   an INFO line on stdout describing what is being faked
#######################################
fake_file_with_content(){
  local target="$1"
  local value="$2"
  # The shadow file mirrors the target's path below /run/fake.
  local shadow="/run/fake/${target}"

  mkdir -p "$(dirname "${shadow}")"
  echo "INFO: faking ${target} to be \"${value}\" (writable)"
  echo "${value}" > "${shadow}"
  mount --bind "${shadow}" "${target}"
}

#######################################
# Makes a sysctl appear writable by shadowing its /proc/sys file with a
# writable copy that holds the current value. Does nothing when the sysctl
# file does not exist.
# Arguments: $1 - sysctl key in dotted form, e.g. "vm.overcommit_memory"
#######################################
fake_sysctl() {
  local key="$1"
  # Dotted key -> /proc/sys path: every "." becomes "/".
  local path="/proc/sys/${key//.//}"

  [[ -f "${path}" ]] || return 0

  local content
  # A failed read is tolerated and yields empty content rather than aborting.
  content="$(cat "${path}")" || true
  fake_file_with_content "${path}" "${content}"
}

#######################################
# Adapts /etc/containerd/config.toml to the environment the node runs in.
#  - on zfs, switches the snapshotter from overlayfs to native
#  - inside a user namespace (rootless): enables restrict_oom_score_adj,
#    falls back to the native snapshotter when overlayfs cannot be mounted,
#    fakes several sysctls as writable, and routes runc through
#    userns-ociwrapper
# Globals:   userns (read) - non-empty when running inside a user namespace
# Outputs:   an INFO line on stdout when overlayfs is disabled;
#            an ERROR line on stderr when the probe directory cannot be created
# Returns:   0 on success, 1 when the temporary probe directory cannot be created
#######################################
configure_containerd() {
  local -r config="/etc/containerd/config.toml"

  # we need to switch to the 'native' snapshotter on zfs
  if [[ "$(stat -f -c %T /kind)" == 'zfs' ]]; then
    sed -i 's/snapshotter = "overlayfs"/snapshotter = "native"/' "${config}"
  fi

  # everything below is userns (rootless) specific
  if [[ -z "${userns}" ]]; then
    return 0
  fi

  # Adjust oomScoreAdj
  sed -i 's/restrict_oom_score_adj = false/restrict_oom_score_adj = true/' "${config}"

  # mounting overlayfs inside userns requires patching kernel.
  # Ubuntu kernel is patched by default.
  # Debian kernel is patched by default as well, but Debian needs `sudo modprobe overlay permit_mounts_in_userns=1`.
  #
  # Probe by attempting a throw-away overlay mount. The declaration is kept
  # apart from the assignment so an mktemp failure is detected: with an empty
  # path the probe directories would otherwise be created directly under "/".
  local tmp
  if ! tmp="$(mktemp -d)" || [[ -z "${tmp}" ]]; then
    echo 'ERROR: failed to create a temporary directory for the overlayfs probe' >&2
    return 1
  fi
  mkdir -p "${tmp}"/{l,u,w,m}
  if mount -t overlay overlay -o "lowerdir=${tmp}/l,upperdir=${tmp}/u,workdir=${tmp}/w" "${tmp}/m"; then
    umount "${tmp}/m"
  else
    echo 'INFO: this kernel does not support mounting overlayfs inside userns. Disabling overlayfs'
    sed -i 's/snapshotter = "overlayfs"/snapshotter = "native"/' "${config}"
  fi
  # ${tmp:?} aborts instead of expanding to an empty path
  rm -rf -- "${tmp:?}"

  # To run vanilla kubelet and kube-proxy inside UserNS, we need to fake several unwritable sysctl to be writable.
  # Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged in the upstream.
  local key
  for key in \
    "vm.overcommit_memory" \
    "vm.panic_on_oom" \
    "kernel.panic" \
    "kernel.panic_on_oops" \
    "kernel.keys.root_maxkeys" \
    "kernel.keys.root_maxbytes" \
    "net.netfilter.nf_conntrack_max" \
    "net.netfilter.nf_conntrack_tcp_timeout_established" \
    "net.netfilter.nf_conntrack_tcp_timeout_close_wait"; do
    fake_sysctl "${key}"
  done

  # Wrap runc to mount fake "/sys/module/nf_conntrack/parameters/hashsize" for kube-proxy.
  # Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged in the upstream.
  sed -i 's/BinaryName = "runc"/BinaryName = "userns-ociwrapper"/' "${config}"
}

configure_proxy() {
Expand Down Expand Up @@ -50,12 +130,16 @@ fix_mount() {
sync
fi

echo 'INFO: remounting /sys read-only'
# systemd-in-a-container should have read only /sys
# https://systemd.io/CONTAINER_INTERFACE/
# however, we need other things from `docker run --privileged` ...
# and this flag also happens to make /sys rw, amongst other things
mount -o remount,ro /sys
if [[ -z "${userns}" ]]; then
echo 'INFO: remounting /sys read-only'
# systemd-in-a-container should have read only /sys
# https://systemd.io/CONTAINER_INTERFACE/
# however, we need other things from `docker run --privileged` ...
# and this flag also happens to make /sys rw, amongst other things
#
# This step is skipped when running inside UserNS, because it fails with EACCES.
mount -o remount,ro /sys
fi

echo 'INFO: making mounts shared' >&2
# for mount propagation
Expand Down Expand Up @@ -239,6 +323,9 @@ enable_network_magic(){
fi
}

# validate state
validate_userns

# run pre-init fixups
# NOTE: it's important that we do configure* first in this order to avoid races
configure_containerd
Expand Down
51 changes: 51 additions & 0 deletions images/base/files/usr/local/bin/userns-ociwrapper
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash

# Copyright 2020 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -o errexit
set -o nounset
set -o pipefail

# The real OCI runtime this wrapper hands over to.
RUNTIME="runc"

#######################################
# Finds the OCI bundle directory named on a runtime command line.
# Recognizes "-b DIR", "--bundle DIR" and "--bundle=DIR"; the first
# occurrence wins.
# Arguments: the runtime command-line arguments
# Outputs:   the bundle directory on stdout, or "." when no bundle flag is given
#######################################
find_bundle() {
  local arg
  local expect_value=""
  # "$@" is quoted so arguments containing spaces or glob characters stay intact
  for arg in "$@"; do
    if [[ -n "${expect_value}" ]]; then
      # the previous argument was -b/--bundle, so this one is its value
      printf '%s\n' "${arg}"
      return 0
    fi
    case "${arg}" in
      -b | --bundle)
        expect_value="1"
        ;;
      --bundle=*)
        printf '%s\n' "${arg#--bundle=}"
        return 0
        ;;
    esac
  done
  printf '%s\n' "."
}

bundle="$(find_bundle "$@")"

#######################################
# Adds a bind mount of a fake nf_conntrack hashsize file to the config.json
# of an OCI bundle.
#
# kube-proxy wants to read "/sys/module/nf_conntrack/parameters/hashsize", but it fails with EACCES when running inside userns.
# So we bind-mount a fake file.
# Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged
#
# Arguments: $1 - bundle directory
#            $2 - (optional) path of the fake hashsize file;
#                 defaults to /run/nf_conntrack_fake_hashsize
# Returns:   0 when config.json was patched, already had the mount, or does
#            not exist; non-zero when writing or rewriting fails
#######################################
patch_bundle_config() {
  local bundle_dir="$1"
  local fake_hashsize="${2:-/run/nf_conntrack_fake_hashsize}"
  local config="${bundle_dir}/config.json"
  local dst="/sys/module/nf_conntrack/parameters/hashsize"

  if [[ ! -f "${config}" ]]; then
    return 0
  fi

  echo "65536" >"${fake_hashsize}" || return 1

  # Append the mount only when no mount for the destination exists yet, so
  # running the wrapper again on the same bundle does not pile up duplicates.
  local q='
    if [(.mounts // [])[] | select(.destination == $dst)] | length > 0 then .
    else .mounts += [{"destination": $dst, "source": $src, "type": "none", "options": ["bind"]}]
    end'

  # The scratch directory lives inside the bundle so the final mv stays on
  # the same filesystem as config.json.
  local tmp
  tmp="$(mktemp -d "${bundle_dir}/ociwrapper.XXXXXXXX")" || return 1

  local rc=0
  {
    jq --arg dst "${dst}" --arg src "${fake_hashsize}" "${q}" <"${config}" >"${tmp}/config.json" \
      && mv -- "${tmp}/config.json" "${config}"
  } || rc=$?
  # always remove the scratch directory, also when jq or mv failed
  rm -rf -- "${tmp:?}"
  return "${rc}"
}

patch_bundle_config "${bundle}"

# Hand over to the real OCI runtime with the caller's arguments unchanged.
# exec replaces this wrapper process, so nothing after this line runs.
exec "$RUNTIME" "$@"
44 changes: 44 additions & 0 deletions site/content/docs/user/quick-start.md
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,50 @@ The structure of the logs will look more or less like this:
The logs contain information about the Docker host, the containers running
kind, the Kubernetes cluster itself, etc.

### Rootless Docker

Starting with kind 0.10.0 and Docker 20.10, Rootless Docker can be used as the node provider of kind.

#### Host requirements
The host needs to be running with cgroup v2.

cgroup v2 is enabled by default on Fedora.
On other distros, cgroup v2 can typically be enabled by adding `GRUB_CMDLINE_LINUX="systemd.unified_cgroup_hierarchy=1"` to `/etc/default/grub` and
running `sudo update-grub`.

Also, depending on the host configuration, the following steps might be needed:

- Create `/etc/systemd/system/user@.service.d/delegate.conf` with the following content, and then run `sudo systemctl daemon-reload`:
```
[Service]
Delegate=yes
```

- Create `/etc/sysctl.d/99-rootless.conf` with the following content, and then run `sudo sysctl --system`:
```
kernel.dmesg_restrict=0
```

#### Restrictions

The restrictions of Rootless Docker apply to kind clusters as well.

e.g.
- OverlayFS cannot be used unless the host is Ubuntu or Debian
- Cannot mount block storage
- Cannot mount NFS

#### Creating a kind cluster with Rootless Docker

To create a kind cluster with Rootless Docker, just run `kind create cluster` command with
`DOCKER_HOST=unix://${XDG_RUNTIME_DIR}/docker.sock`.

```console
$ export DOCKER_HOST=unix://${XDG_RUNTIME_DIR}/docker.sock
$ kind create cluster
```

[go-supported]: https://golang.org/doc/devel/release.html#policy
[known issues]: /docs/user/known-issues
[releases]: https://github.com/kubernetes-sigs/kind/releases
Expand Down

0 comments on commit 3304c25

Please sign in to comment.