diff --git a/.buildkite/pipeline.yaml b/.buildkite/pipeline.yaml index ac9bebb239..8704f43a95 100644 --- a/.buildkite/pipeline.yaml +++ b/.buildkite/pipeline.yaml @@ -267,6 +267,12 @@ steps: agents: cgroup: "v2" os: "ubuntu" + - <<: *common + label: ":podman: Podman" + command: sudo ./test/podman/run.sh + agents: + cgroup: "v2" + os: "ubuntu" # Check the website builds. - <<: *common diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 977fdf27d9..a20baeb974 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -48,6 +48,8 @@ type Do struct { ip string quiet bool overlay bool + uidMap idMapSlice + gidMap idMapSlice } // Name implements subcommands.Command.Name. @@ -72,6 +74,44 @@ used for testing only. ` } +type idMapSlice []specs.LinuxIDMapping + +// String implements flag.Value.String. +func (is *idMapSlice) String() string { + return fmt.Sprintf("%#v", is) +} + +// Get implements flag.Getter.Get. +func (is *idMapSlice) Get() interface{} { + return is +} + +// Set implements flag.Value.Set. It parses one "ContainerID HostID Size" triple of whitespace-separated 32-bit unsigned integers and appends it; negative or out-of-range values are rejected instead of being truncated. +func (is *idMapSlice) Set(s string) error { + fs := strings.Fields(s) + if len(fs) != 3 { + return fmt.Errorf("invalid mapping: %s", s) + } + var cid, hid, size uint64 + var err error + if cid, err = strconv.ParseUint(fs[0], 10, 32); err != nil { + return fmt.Errorf("invalid mapping: %s", s) + } + if hid, err = strconv.ParseUint(fs[1], 10, 32); err != nil { + return fmt.Errorf("invalid mapping: %s", s) + } + if size, err = strconv.ParseUint(fs[2], 10, 32); err != nil { + return fmt.Errorf("invalid mapping: %s", s) + } + m := specs.LinuxIDMapping{ + ContainerID: uint32(cid), + HostID: uint32(hid), + Size: uint32(size), + } + *is = append(*is, m) + return nil +} + // SetFlags implements subcommands.Command.SetFlags. 
func (c *Do) SetFlags(f *flag.FlagSet) { f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`) @@ -79,6 +119,8 @@ func (c *Do) SetFlags(f *flag.FlagSet) { f.StringVar(&c.ip, "ip", "192.168.10.2", "IPv4 address for the sandbox") f.BoolVar(&c.quiet, "quiet", false, "suppress runsc messages to stdout. Application output is still sent to stdout and stderr") f.BoolVar(&c.overlay, "force-overlay", true, "use an overlay. WARNING: disabling gives the command write access to the host") + f.Var(&c.uidMap, "uid-map", "Add a user id mapping [ContainerID, HostID, Size]") + f.Var(&c.gidMap, "gid-map", "Add a group id mapping [ContainerID, HostID, Size]") } // Execute implements subcommands.Command.Execute. @@ -129,6 +171,12 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000)) + if c.uidMap != nil { + addNamespace(spec, specs.LinuxNamespace{Type: specs.UserNamespace}) + spec.Linux.UIDMappings = c.uidMap + spec.Linux.GIDMappings = c.gidMap + } + if conf.Network == config.NetworkNone { addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace}) } else if conf.Rootless { diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 17adc9bf8d..38295dd282 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -18,8 +18,10 @@ import ( "context" "encoding/json" "fmt" + "io" "os" "path/filepath" + "runtime" "runtime/debug" "strings" @@ -63,8 +65,9 @@ type Gofer struct { applyCaps bool setUpRoot bool - specFD int - mountsFD int + specFD int + mountsFD int + syncUsernsFD int } // Name implements subcommands.Command. @@ -92,6 +95,7 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.Var(&g.ioFDs, "io-fds", "list of FDs to connect gofer servers. 
They must follow this order: root first, then mounts as defined in the spec") f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec") f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).") + f.IntVar(&g.syncUsernsFD, "sync-userns-fd", -1, "file descriptor used to synchronize rootless user namespace initialization.") } // Execute implements subcommands.Command. @@ -113,6 +117,26 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) util.Fatalf("reading spec: %v", err) } + if g.syncUsernsFD >= 0 { + f := os.NewFile(uintptr(g.syncUsernsFD), "sync FD") + defer f.Close() + var b [1]byte + if n, err := f.Read(b[:]); n != 0 || err != io.EOF { + util.Fatalf("failed to sync: %v: %v", n, err) + } + + f.Close() + // SETUID changes UID on the current system thread, so we have + // to re-execute current binary. + runtime.LockOSThread() + if _, _, errno := unix.RawSyscall(unix.SYS_SETUID, 0, 0, 0); errno != 0 { + util.Fatalf("failed to set UID: %v", errno) + } + if _, _, errno := unix.RawSyscall(unix.SYS_SETGID, 0, 0, 0); errno != 0 { + util.Fatalf("failed to set GID: %v", errno) + } + } + if g.setUpRoot { if err := setupRootFS(spec, conf); err != nil { util.Fatalf("Error setting up root FS: %v", err) @@ -122,7 +146,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Disable caps when calling myself again. // Note: minimal argument handling for the default case to keep it simple. 
args := os.Args - args = append(args, "--apply-caps=false", "--setup-root=false") + args = append(args, "--apply-caps=false", "--setup-root=false", "--sync-userns-fd=-1") util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, goferCaps, setCapsAndCallSelf(args, goferCaps)) panic("unreachable") } diff --git a/runsc/container/container.go b/runsc/container/container.go index 2c3f94318e..f90fec0a97 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -971,15 +971,49 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bu {Type: specs.UTSNamespace}, } + rootlessEUID := unix.Getuid() != 0 + var syncFile *os.File // Setup any uid/gid mappings, and create or join the configured user // namespace so the gofer's view of the filesystem aligns with the // users in the sandbox. - userNS := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec) - nss = append(nss, userNS...) - specutils.SetUIDGIDMappings(cmd, spec) - if len(userNS) != 0 { - // We need to set UID and GID to have capabilities in a new user namespace. - cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} + if !rootlessEUID { + userNS := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec) + nss = append(nss, userNS...) + specutils.SetUIDGIDMappings(cmd, spec) + if len(userNS) != 0 { + // We need to set UID and GID to have capabilities in a new user namespace. 
+ cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} + } + } else { + userNS := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec) + if len(userNS) == 0 { + return nil, nil, fmt.Errorf("unable to run a rootless container without userns") + } + fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + syncFile = os.NewFile(uintptr(fds[0]), "sync FD") + defer syncFile.Close() + + f := os.NewFile(uintptr(fds[1]), "sync other FD") + donations.DonateAndClose("sync-userns-fd", f) + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &unix.SysProcAttr{} + } + cmd.SysProcAttr.AmbientCaps = []uintptr{ + unix.CAP_CHOWN, + unix.CAP_DAC_OVERRIDE, + unix.CAP_DAC_READ_SEARCH, + unix.CAP_FOWNER, + unix.CAP_FSETID, + unix.CAP_SYS_CHROOT, + unix.CAP_SETUID, + unix.CAP_SETGID, + unix.CAP_SYS_ADMIN, + unix.CAP_SETPCAP, + } + nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) } donations.Transfer(cmd, nextFD) @@ -990,6 +1024,43 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bu if err := specutils.StartInNS(cmd, nss); err != nil { return nil, nil, fmt.Errorf("gofer: %v", err) } + + if rootlessEUID { + log.Debugf("Setting user mappings") + args := []string{strconv.Itoa(cmd.Process.Pid)} + for _, idMap := range spec.Linux.UIDMappings { + log.Infof("Mapping host uid %d to container uid %d (size=%d)", + idMap.HostID, idMap.ContainerID, idMap.Size) + args = append(args, + strconv.Itoa(int(idMap.ContainerID)), + strconv.Itoa(int(idMap.HostID)), + strconv.Itoa(int(idMap.Size)), + ) + } + + out, err := exec.Command("newuidmap", args...).CombinedOutput() + log.Debugf("newuidmap: %#v\n%s", args, out) + if err != nil { + return nil, nil, fmt.Errorf("newuidmap failed: %w", err) + } + + args = []string{strconv.Itoa(cmd.Process.Pid)} + for _, idMap := range spec.Linux.GIDMappings { + log.Infof("Mapping host gid %d to container gid %d (size=%d)", 
idMap.HostID, idMap.ContainerID, idMap.Size) + args = append(args, + strconv.Itoa(int(idMap.ContainerID)), + strconv.Itoa(int(idMap.HostID)), + strconv.Itoa(int(idMap.Size)), + ) + } + out, err = exec.Command("newgidmap", args...).CombinedOutput() + log.Debugf("newgidmap: %#v\n%s", args, out) + if err != nil { + return nil, nil, fmt.Errorf("newgidmap failed: %w", err) + } + } + log.Infof("Gofer started, PID: %d", cmd.Process.Pid) c.GoferPid = cmd.Process.Pid c.goferIsChild = true diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index ca4bf58d9f..9e31b4ebbc 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -18,6 +18,7 @@ package sandbox import ( "context" "encoding/json" + "errors" "fmt" "io" "math" @@ -536,6 +537,7 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn donations := donation.Agency{} defer donations.Close() + rootlessEUID := unix.Getuid() != 0 // // These flags must come BEFORE the "boot" command in cmd.Args. // @@ -722,13 +724,13 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid()) log.Warningf("Running sandbox in test mode without chroot. 
This is only safe in tests!") - } else if specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) { + } else if rootlessEUID || specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) { log.Infof("Sandbox will be started in new user namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) cmd.Args = append(cmd.Args, "--setup-root") const nobody = 65534 - if conf.Rootless { + if rootlessEUID || conf.Rootless { log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid()) } else { // Map nobody in the new namespace to nobody in the parent namespace. @@ -1419,6 +1421,10 @@ func (s *Sandbox) configureStdios(conf *config.Config, stdios []*os.File) error for _, file := range stdios { log.Debugf("Changing %q ownership to %d/%d", file.Name(), s.UID, s.GID) if err := file.Chown(s.UID, s.GID); err != nil { + if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) { + log.Warningf("can't change an owner of %s: %s", file.Name(), err) + continue + } return err } } diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index 21559f5e5f..cdf108b4b4 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -119,7 +119,7 @@ func setNS(fd, nsType uintptr) error { // that will restore the namespace to the original value. // // Preconditions: Must be called with os thread locked. 
-func ApplyNS(ns specs.LinuxNamespace) (func(), error) { +func ApplyNS(ns specs.LinuxNamespace) (func() error, error) { log.Infof("Applying namespace %v at path %q", ns.Type, ns.Path) newNS, err := os.Open(ns.Path) if err != nil { @@ -140,27 +140,49 @@ func ApplyNS(ns specs.LinuxNamespace) (func(), error) { oldNS.Close() return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err) } - return func() { + return func() error { log.Infof("Restoring namespace %v", ns.Type) defer oldNS.Close() if err := setNS(oldNS.Fd(), flag); err != nil { - panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err)) + return fmt.Errorf("error restoring namespace: of type %v: %v", ns.Type, err) } + return nil }, nil } // StartInNS joins or creates the given namespaces and calls cmd.Start before // restoring the namespaces to the original values. func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { - // We are about to setup namespaces, which requires the os thread being - // locked so that Go doesn't change the thread out from under us. - runtime.LockOSThread() - defer runtime.UnlockOSThread() + errChan := make(chan error) + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + rstFuncs, err := startInNS(cmd, nss) + errChan <- err + for _, rstFunc := range rstFuncs { + err := rstFunc() + if err == nil { + continue + } + + // One or more namespaces have not been restored, but + // we can't destroy the current system thread, because + // a child process is executed with Pdeathsig. + log.Debugf("Block the current system thread due to: %s", err) + c := make(chan interface{}) + <-c + } + }() + return <-errChan +} +func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) ([]func() error, error) { if cmd.SysProcAttr == nil { cmd.SysProcAttr = &unix.SysProcAttr{} } + var deferFuncs []func() error for _, ns := range nss { if ns.Path == "" { // No path. Just set a flag to create a new namespace. 
@@ -171,12 +193,12 @@ func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { // before exiting. restoreNS, err := ApplyNS(ns) if err != nil { - return err + return deferFuncs, err } - defer restoreNS() + deferFuncs = append(deferFuncs, restoreNS) } - return cmd.Start() + return deferFuncs, cmd.Start() } // SetUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd. diff --git a/test/podman/run.sh b/test/podman/run.sh new file mode 100755 index 0000000000..d0dc8426cf --- /dev/null +++ b/test/podman/run.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Copyright 2022 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -u -x -e -o pipefail + +export DEBIAN_FRONTEND=noninteractive +sudo -E apt-get install -qy podman + +test_dir="$(mktemp -d /tmp/gvisor-podman.XXXXXX)" +podman_runtime="${test_dir}/runsc.podman" + +cleanup() { + rm -rf "${test_dir}" +} +trap cleanup EXIT + +make copy TARGETS=runsc DESTINATION="${test_dir}" +cat > "${podman_runtime}" <