From 6adc0720b2e66d3dee7e115d93ec3347f9a8a212 Mon Sep 17 00:00:00 2001 From: Etienne Perot Date: Fri, 1 Nov 2024 15:36:08 -0700 Subject: [PATCH] runsc: When mounting a new procfs fails, fall back to recursive bind-mount. As part of sandbox startup, `runsc` needs to set up a chroot environment with a minimal working `procfs` filesystem mounted within. However, doing so from within a container (as applications like Dangerzone do) may fail, because in the container runtime's default configuration, some paths of the procfs filesystem visible from within the container may be obstructed. This prevents mounting new unobstructed instances of `procfs`. This change detects this case and falls back to the previous behavior of using a recursive bind-mount of `/proc` in such a case. The obstructed subdirectories of procfs are preserved in this case, which is fine because we only need a very minimal subset of `procfs` to actually work. Additionally, `runsc` actually only needs a few kernel parameter files and `/proc/self` in order to work. So this change sets up a `tmpfs` mount that contains just those files, with the kernel parameter files being plainly copied and `/proc/self` being a symlink to the one present in the mounted view of `procfs` (regardless of which mounting method was used). The `runtime_in_docker` test will continuously verify that this fallback mechanism works to avoid similar breakage in the future. Credits to @avagin for figuring out this solution. Fixes #10944. 
PiperOrigin-RevId: 692310347 --- pkg/coretag/coretag.go | 8 +++- pkg/coretag/coretag_test.go | 11 ++++- runsc/boot/loader_test.go | 2 +- runsc/cmd/boot.go | 2 +- runsc/cmd/chroot.go | 72 ++++++++++++++++++++++++++++-- runsc/specutils/namespace.go | 2 +- test/e2e/runtime_in_docker_test.go | 12 ----- 7 files changed, 89 insertions(+), 20 deletions(-) diff --git a/pkg/coretag/coretag.go b/pkg/coretag/coretag.go index 98e1485a97..9574b23617 100644 --- a/pkg/coretag/coretag.go +++ b/pkg/coretag/coretag.go @@ -41,6 +41,7 @@ func Enable() error { } // GetAllCoreTags returns the core tag of all the threads in the thread group. +// PID 0 means the current pid. func GetAllCoreTags(pid int) ([]uint64, error) { // prctl(PR_SCHED_CORE_GET, PR_SCHED_CORE_SCOPE_THREAD_GROUP, ...) is not supported // in linux. So instead we get all threads from /proc/<pid>/task and get all the @@ -75,9 +76,14 @@ func GetAllCoreTags(pid int) ([]uint64, error) { } // getTids returns set of tids as reported by /proc/<pid>/task. +// PID 0 means the current PID. 
func getTids(pid int) (map[int]struct{}, error) { tids := make(map[int]struct{}) - files, err := os.ReadDir("/proc/" + strconv.Itoa(pid) + "/task") + path := "/proc/self/task" + if pid != 0 { + path = fmt.Sprintf("/proc/%d/task", pid) + } + files, err := os.ReadDir(path) if err != nil { return nil, err } diff --git a/pkg/coretag/coretag_test.go b/pkg/coretag/coretag_test.go index 1930716c47..a9458499f2 100644 --- a/pkg/coretag/coretag_test.go +++ b/pkg/coretag/coretag_test.go @@ -16,6 +16,7 @@ package coretag import ( "os" + "reflect" "testing" "gvisor.dev/gvisor/pkg/hostos" @@ -36,11 +37,19 @@ func TestEnable(t *testing.T) { t.Fatalf("Enable() got error %v, wanted nil", err) } - coreTags, err := GetAllCoreTags(os.Getpid()) + pid := os.Getpid() + coreTags, err := GetAllCoreTags(pid) if err != nil { t.Fatalf("GetAllCoreTags() got error %v, wanted nil", err) } if len(coreTags) != 1 { t.Fatalf("Got coreTags %v, wanted len(coreTags)=1", coreTags) } + coreTagsSelf, err := GetAllCoreTags(0) + if err != nil { + t.Fatalf("GetAllCoreTags(0) got error %v, wanted nil", err) + } + if !reflect.DeepEqual(coreTags, coreTagsSelf) { + t.Fatalf("Got different coreTags for PID %d vs self: %v vs %v", pid, coreTags, coreTagsSelf) + } } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index ab99ee5c2e..1fb54321a4 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -245,7 +245,7 @@ func TestStartSignal(t *testing.T) { func TestHostnetWithRawSockets(t *testing.T) { // Drop CAP_NET_RAW from effective capabilities, if we have it. 
pid := os.Getpid() - caps, err := capability.NewPid2(os.Getpid()) + caps, err := capability.NewPid2(0) if err != nil { t.Fatalf("error getting capabilities for pid %d: %v", pid, err) } diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index d736b80c51..4b0f3b188a 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -471,7 +471,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcomma // Verify that all sentry threads are properly core tagged, and log // current core tag. - coreTags, err := coretag.GetAllCoreTags(os.Getpid()) + coreTags, err := coretag.GetAllCoreTags(0) if err != nil { util.Fatalf("Failed read current core tags: %v", err) } diff --git a/runsc/cmd/chroot.go b/runsc/cmd/chroot.go index 00f0cf0a58..3de0b20125 100644 --- a/runsc/cmd/chroot.go +++ b/runsc/cmd/chroot.go @@ -82,6 +82,73 @@ func copyFile(dst, src string) error { return err } +// setupMinimalProcfs creates a minimal procfs-like tree at `${chroot}/proc`. +func setupMinimalProcfs(chroot string) error { + // We can't always directly mount procfs because it may be obstructed + // by submounts within it. See https://gvisor.dev/issue/10944. + // All we really need from procfs is /proc/self and a few kernel + // parameter files, which are typically not obstructed. + // So we create a tmpfs at /proc and manually copy the kernel parameter + // files into it. Then, to get /proc/self, we mount either a new + // instance of procfs (if possible), or a recursive bind mount of the + // procfs we do have access to (which still contains the obstructed + // submounts but /proc/self is not obstructed), and we symlink + // our /proc/self to the one in that mount. + // + // Why not try to mount the new procfs instance at /proc directly? + // Because that would cause the set of files at /proc to differ + // between the "new procfs instance" case and the "recursive bind + // mount" case. 
Thus, this could introduce a bug whereby gVisor starts + // to depend on a /proc file that is present in one case but not the + // other, without decent test coverage to catch it. + procRoot := filepath.Join(chroot, "/proc") + if err := os.Mkdir(procRoot, 0755); err != nil { + return fmt.Errorf("error creating /proc in chroot: %v", err) + } + if err := specutils.SafeMount("runsc-proc", procRoot, "tmpfs", + unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC, "", "/proc"); err != nil { + return fmt.Errorf("error mounting tmpfs in /proc: %v", err) + } + flags := uint32(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC | unix.MS_RDONLY) + procSubmountDir := "sandbox-proc" + if newProcfsErr := mountInChroot(chroot, "proc", "/proc/"+procSubmountDir, "proc", flags); newProcfsErr != nil { + log.Debugf("Unable to mount a new instance of the procfs file system at %q (%v); trying a recursive bind mount instead.", filepath.Join(procRoot, procSubmountDir), newProcfsErr) + procSubmountDir = "host-proc" + if bindErr := mountInChroot(chroot, "/proc", "/proc/"+procSubmountDir, "bind", + unix.MS_BIND|unix.MS_REC|flags); bindErr != nil { + return fmt.Errorf("error recursively bind-mounting proc at %q (%w) after also failing to mount a new procfs instance there (%v)", filepath.Join(procRoot, procSubmountDir), bindErr, newProcfsErr) + } + log.Debugf("Successfully mounted a recursive bind mount of procfs at %q; continuing.", filepath.Join(procRoot, procSubmountDir)) + } + // Create needed directories. + for _, d := range []string{ + "/proc/sys", + "/proc/sys/kernel", + "/proc/sys/vm", + } { + if err := os.Mkdir(filepath.Join(chroot, d), 0755); err != nil { + return fmt.Errorf("error creating directory %q: %v", filepath.Join(chroot, d), err) + } + } + // Copy needed files. 
+ for _, f := range []string{ + "/proc/sys/vm/mmap_min_addr", + "/proc/sys/kernel/cap_last_cap", + } { + if err := copyFile(filepath.Join(chroot, f), f); err != nil { + return fmt.Errorf("failed to copy %q -> %q: %w", f, filepath.Join(chroot, f), err) + } + } + // Create symlink for /proc/self. + if err := os.Symlink(procSubmountDir+"/self", filepath.Join(procRoot, "self")); err != nil { + return fmt.Errorf("error creating symlink %q -> %q: %w", filepath.Join(procRoot, "self"), procSubmountDir+"/self", err) + } + if err := os.Chmod(procRoot, 0o111); err != nil { + return fmt.Errorf("error chmodding %q: %v", procRoot, err) + } + return nil +} + // setUpChroot creates an empty directory with runsc mounted at /runsc and proc // mounted at /proc. func setUpChroot(spec *specs.Spec, conf *config.Config) error { @@ -109,9 +176,8 @@ func setUpChroot(spec *specs.Spec, conf *config.Config) error { log.Warningf("Failed to copy /etc/localtime: %v. UTC timezone will be used.", err) } - flags := uint32(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC | unix.MS_RDONLY) - if err := mountInChroot(chroot, "proc", "/proc", "proc", flags); err != nil { - return fmt.Errorf("error mounting proc in chroot: %v", err) + if err := setupMinimalProcfs(chroot); err != nil { + return fmt.Errorf("error setting up minimal procfs in chroot %q: %v", chroot, err) } if err := tpuProxyUpdateChroot("/", chroot, spec, conf); err != nil { diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index abbfa6350b..bd3754b08a 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -214,7 +214,7 @@ func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { // HasCapabilities returns true if the user has all capabilities in 'cs'. 
func HasCapabilities(cs ...capability.Cap) bool { - caps, err := capability.NewPid2(os.Getpid()) + caps, err := capability.NewPid2(0) if err != nil { return false } diff --git a/test/e2e/runtime_in_docker_test.go b/test/e2e/runtime_in_docker_test.go index 97c7e5aa96..9924c2c1d2 100644 --- a/test/e2e/runtime_in_docker_test.go +++ b/test/e2e/runtime_in_docker_test.go @@ -74,18 +74,6 @@ func (test testVariant) run(ctx context.Context, logger testutil.Logger, runscPa ReadOnly: false, }) } - // Mount an unobstructed view of procfs at /proc2 so that the runtime - // can mount a fresh procfs. - // TODO(gvisor.dev/issue/10944): Remove this once issue is fixed. - opts.Mounts = append(opts.Mounts, mount.Mount{ - Type: mount.TypeBind, - Source: "/proc", - Target: "/proc2", - ReadOnly: false, - BindOptions: &mount.BindOptions{ - NonRecursive: true, - }, - }) const wantMessage = "It became a jumble of words, a litany, almost a kind of glossolalia." args := []string{ "/runtime",