From b6967fa84c26b3fc8a870d178ccdf8d40785894a Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 6 Apr 2022 20:47:25 -0700 Subject: [PATCH] Decouple cgroup devices handling This commit separates the functionality of setting cgroup device rules out of libct/cgroups to libct/cgroups/devices package. This package, if imported, sets the function variables in libct/cgroups and libct/cgroups/systemd, so that a cgroup manager can use those to manage devices. If those function variables are nil (when libct/cgroups/devices are not imported), a cgroup manager returns the ErrDevicesUnsupported in case any device rules are set in Resources. It also consolidates the code from libct/cgroups/ebpf and libct/cgroups/ebpf/devicefilter into libct/cgroups/devices. Moved some tests in libct/cg/sd that require device management to libct/sd/devices. Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/cgroups.go | 15 ++ .../devicefilter => devices}/devicefilter.go | 9 +- .../devicefilter_test.go | 2 +- libcontainer/cgroups/devices/devices.go | 16 ++ .../cgroups/{ebpf => devices}/ebpf_linux.go | 2 +- libcontainer/cgroups/devices/systemd.go | 233 ++++++++++++++++ libcontainer/cgroups/devices/systemd_test.go | 253 ++++++++++++++++++ libcontainer/cgroups/devices/v1.go | 84 ++++++ .../devices_test.go => devices/v1_test.go} | 30 ++- .../cgroups/{fs2/devices.go => devices/v2.go} | 10 +- libcontainer/cgroups/fs/devices.go | 82 +----- libcontainer/cgroups/fs/fs.go | 2 +- libcontainer/cgroups/fs2/fs2.go | 16 +- libcontainer/cgroups/systemd/common.go | 16 ++ libcontainer/cgroups/systemd/devices.go | 219 --------------- .../{devices_test.go => freeze_test.go} | 226 +--------------- libcontainer/factory_linux.go | 2 + libcontainer/integration/init_test.go | 2 + 18 files changed, 674 insertions(+), 545 deletions(-) rename libcontainer/cgroups/{ebpf/devicefilter => devices}/devicefilter.go (95%) rename libcontainer/cgroups/{ebpf/devicefilter => devices}/devicefilter_test.go (99%) create mode 100644 libcontainer/cgroups/devices/devices.go rename libcontainer/cgroups/{ebpf => devices}/ebpf_linux.go (99%) create mode 100644 libcontainer/cgroups/devices/systemd.go create mode 100644 libcontainer/cgroups/devices/systemd_test.go create mode 100644 libcontainer/cgroups/devices/v1.go rename libcontainer/cgroups/{fs/devices_test.go => devices/v1_test.go} (60%) rename libcontainer/cgroups/{fs2/devices.go => devices/v2.go} (83%) rename libcontainer/cgroups/systemd/{devices_test.go => freeze_test.go} (57%) diff --git a/libcontainer/cgroups/cgroups.go b/libcontainer/cgroups/cgroups.go index ba2b2266c9d..b9ba889b7a0 100644 --- a/libcontainer/cgroups/cgroups.go +++ b/libcontainer/cgroups/cgroups.go @@ -1,9 +1,24 @@ package cgroups import ( + "errors" + "github.com/opencontainers/runc/libcontainer/configs" ) +var ( + // ErrDevicesUnsupported is an error returned when a cgroup manager + // is not configured to set device rules. + ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules") + + // DevicesSetV1 and DevicesSetV2 are functions to set devices for + // cgroup v1 and v2, respectively. Unless libcontainer/cgroups/devices + // package is imported, it is set to nil, so cgroup managers can't + // manage devices. + DevicesSetV1 func(path string, r *configs.Resources) error + DevicesSetV2 func(path string, r *configs.Resources) error +) + type Manager interface { // Apply creates a cgroup, if not yet created, and adds a process // with the specified pid into that cgroup. A special value of -1 diff --git a/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go b/libcontainer/cgroups/devices/devicefilter.go similarity index 95% rename from libcontainer/cgroups/ebpf/devicefilter/devicefilter.go rename to libcontainer/cgroups/devices/devicefilter.go index 4e69b35bcda..3849810b30e 100644 --- a/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go +++ b/libcontainer/cgroups/devices/devicefilter.go @@ -1,10 +1,10 @@ -// Package devicefilter contains eBPF device filter program +// Implements creation of eBPF device filter program. // -// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c +// Based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c // // Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano) // agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397 -package devicefilter +package devices import ( "errors" @@ -13,7 +13,6 @@ import ( "strconv" "github.com/cilium/ebpf/asm" - devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices" "github.com/opencontainers/runc/libcontainer/devices" "golang.org/x/sys/unix" ) @@ -30,7 +29,7 @@ func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) { // gives us a guarantee that the behaviour of devices filtering is the same // as cgroupv1, including security hardenings to avoid misconfiguration // (such as punching holes in wildcard rules). - emu := new(devicesemulator.Emulator) + emu := new(Emulator) for _, rule := range rules { if err := emu.Apply(*rule); err != nil { return nil, "", err diff --git a/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go b/libcontainer/cgroups/devices/devicefilter_test.go similarity index 99% rename from libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go rename to libcontainer/cgroups/devices/devicefilter_test.go index 25703be5ad7..7cabf0673f5 100644 --- a/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go +++ b/libcontainer/cgroups/devices/devicefilter_test.go @@ -1,4 +1,4 @@ -package devicefilter +package devices import ( "strings" diff --git a/libcontainer/cgroups/devices/devices.go b/libcontainer/cgroups/devices/devices.go new file mode 100644 index 00000000000..844d0563b59 --- /dev/null +++ b/libcontainer/cgroups/devices/devices.go @@ -0,0 +1,16 @@ +// Package devices contains functionality to manage cgroup devices, which +// is exposed indirectly via libcontainer/cgroups managers. +// +// To enable cgroup managers to manage devices, this package must be imported. +package devices + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" +) + +func init() { + cgroups.DevicesSetV1 = setV1 + cgroups.DevicesSetV2 = setV2 + systemd.GenerateDeviceProps = systemdProperties +} diff --git a/libcontainer/cgroups/ebpf/ebpf_linux.go b/libcontainer/cgroups/devices/ebpf_linux.go similarity index 99% rename from libcontainer/cgroups/ebpf/ebpf_linux.go rename to libcontainer/cgroups/devices/ebpf_linux.go index 104c74a890f..77277f93bb7 100644 --- a/libcontainer/cgroups/ebpf/ebpf_linux.go +++ b/libcontainer/cgroups/devices/ebpf_linux.go @@ -1,4 +1,4 @@ -package ebpf +package devices import ( "errors" diff --git a/libcontainer/cgroups/devices/systemd.go b/libcontainer/cgroups/devices/systemd.go new file mode 100644 index 00000000000..02962540058 --- /dev/null +++ b/libcontainer/cgroups/devices/systemd.go @@ -0,0 +1,233 @@ +package devices + +import ( + "bufio" + "errors" + "fmt" + "os" + "strings" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/godbus/dbus/v5" + "github.com/sirupsen/logrus" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" +) + +// systemdProperties takes the configured device rules and generates a +// corresponding set of systemd properties to configure the devices correctly. +func systemdProperties(r *configs.Resources) ([]systemdDbus.Property, error) { + if r.SkipDevices { + return nil, nil + } + + properties := []systemdDbus.Property{ + // Always run in the strictest white-list mode. + newProp("DevicePolicy", "strict"), + // Empty the DeviceAllow array before filling it. + newProp("DeviceAllow", []deviceAllowEntry{}), + } + + // Figure out the set of rules. + configEmu := Emulator{} + for _, rule := range r.Devices { + if err := configEmu.Apply(*rule); err != nil { + return nil, fmt.Errorf("unable to apply rule for systemd: %w", err) + } + } + // systemd doesn't support blacklists. So we log a warning, and tell + // systemd to act as a deny-all whitelist. This ruleset will be replaced + // with our normal fallback code. This may result in spurious errors, but + // the only other option is to error out here. + if configEmu.IsBlacklist() { + // However, if we're dealing with an allow-all rule then we can do it. + if configEmu.IsAllowAll() { + return allowAllDevices(), nil + } + logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule") + return properties, nil + } + + // Now generate the set of rules we actually need to apply. Unlike the + // normal devices cgroup, in "strict" mode systemd defaults to a deny-all + // whitelist which is the default for devices.Emulator. + finalRules, err := configEmu.Rules() + if err != nil { + return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err) + } + var deviceAllowList []deviceAllowEntry + for _, rule := range finalRules { + if !rule.Allow { + // Should never happen. + return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule) + } + switch rule.Type { + case devices.BlockDevice, devices.CharDevice: + default: + // Should never happen. + return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type) + } + + entry := deviceAllowEntry{ + Perms: string(rule.Permissions), + } + + // systemd has a fairly odd (though understandable) syntax here, and + // because of the OCI configuration format we have to do quite a bit of + // trickery to convert things: + // + // * Concrete rules with non-wildcard major/minor numbers have to use + // /dev/{block,char} paths. This is slightly odd because it means + // that we cannot add whitelist rules for devices that don't exist, + // but there's not too much we can do about that. + // + // However, path globbing is not support for path-based rules so we + // need to handle wildcards in some other manner. + // + // * Wildcard-minor rules have to specify a "device group name" (the + // second column in /proc/devices). + // + // * Wildcard (major and minor) rules can just specify a glob with the + // type ("char-*" or "block-*"). + // + // The only type of rule we can't handle is wildcard-major rules, and + // so we'll give a warning in that case (note that the fallback code + // will insert any rules systemd couldn't handle). What amazing fun. + + if rule.Major == devices.Wildcard { + // "_ *:n _" rules aren't supported by systemd. + if rule.Minor != devices.Wildcard { + logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule) + continue + } + + // "_ *:* _" rules just wildcard everything. + prefix, err := groupPrefix(rule.Type) + if err != nil { + return nil, err + } + entry.Path = prefix + "*" + } else if rule.Minor == devices.Wildcard { + // "_ n:* _" rules require a device group from /proc/devices. + group, err := findDeviceGroup(rule.Type, rule.Major) + if err != nil { + return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err) + } + if group == "" { + // Couldn't find a group. + logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule) + continue + } + entry.Path = group + } else { + // "_ n:m _" rules are just a path in /dev/{block,char}/. + switch rule.Type { + case devices.BlockDevice: + entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor) + case devices.CharDevice: + entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor) + } + } + deviceAllowList = append(deviceAllowList, entry) + } + + properties = append(properties, newProp("DeviceAllow", deviceAllowList)) + return properties, nil +} + +func newProp(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} + +func groupPrefix(ruleType devices.Type) (string, error) { + switch ruleType { + case devices.BlockDevice: + return "block-", nil + case devices.CharDevice: + return "char-", nil + default: + return "", fmt.Errorf("device type %v has no group prefix", ruleType) + } +} + +// findDeviceGroup tries to find the device group name (as listed in +// /proc/devices) with the type prefixed as required for DeviceAllow, for a +// given (type, major) combination. If more than one device group exists, an +// arbitrary one is chosen. +func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) { + fh, err := os.Open("/proc/devices") + if err != nil { + return "", err + } + defer fh.Close() + + prefix, err := groupPrefix(ruleType) + if err != nil { + return "", err + } + + scanner := bufio.NewScanner(fh) + var currentType devices.Type + for scanner.Scan() { + // We need to strip spaces because the first number is column-aligned. + line := strings.TrimSpace(scanner.Text()) + + // Handle the "header" lines. + switch line { + case "Block devices:": + currentType = devices.BlockDevice + continue + case "Character devices:": + currentType = devices.CharDevice + continue + case "": + continue + } + + // Skip lines unrelated to our type. + if currentType != ruleType { + continue + } + + // Parse out the (major, name). + var ( + currMajor int64 + currName string + ) + if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 { + if err == nil { + err = errors.New("wrong number of fields") + } + return "", fmt.Errorf("scan /proc/devices line %q: %w", line, err) + } + + if currMajor == ruleMajor { + return prefix + currName, nil + } + } + if err := scanner.Err(); err != nil { + return "", fmt.Errorf("reading /proc/devices: %w", err) + } + // Couldn't find the device group. + return "", nil +} + +// DeviceAllow is the dbus type "a(ss)" which means we need a struct +// to represent it in Go. +type deviceAllowEntry struct { + Path string + Perms string +} + +func allowAllDevices() []systemdDbus.Property { + // Setting mode to auto and removing all DeviceAllow rules + // results in allowing access to all devices. + return []systemdDbus.Property{ + newProp("DevicePolicy", "auto"), + newProp("DeviceAllow", []deviceAllowEntry{}), + } +} diff --git a/libcontainer/cgroups/devices/systemd_test.go b/libcontainer/cgroups/devices/systemd_test.go new file mode 100644 index 00000000000..e20b57c5d40 --- /dev/null +++ b/libcontainer/cgroups/devices/systemd_test.go @@ -0,0 +1,253 @@ +package devices + +import ( + "bytes" + "os" + "os/exec" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" +) + +// TestPodSkipDevicesUpdate checks that updating a pod having SkipDevices: true +// does not result in spurious "permission denied" errors in a container +// running under the pod. The test is somewhat similar in nature to the +// @test "update devices [minimal transition rules]" in tests/integration, +// but uses a pod. +func TestPodSkipDevicesUpdate(t *testing.T) { + if !systemd.IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + podName := "system-runc_test_pod" + t.Name() + ".slice" + podConfig := &configs.Cgroup{ + Systemd: true, + Parent: "system.slice", + Name: podName, + Resources: &configs.Resources{ + PidsLimit: 42, + Memory: 32 * 1024 * 1024, + SkipDevices: true, + }, + } + // Create "pod" cgroup (a systemd slice to hold containers). + pm := newManager(t, podConfig) + if err := pm.Apply(-1); err != nil { + t.Fatal(err) + } + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + + containerConfig := &configs.Cgroup{ + Parent: podName, + ScopePrefix: "test", + Name: "PodSkipDevicesUpdate", + Resources: &configs.Resources{ + Devices: []*devices.Rule{ + // Allow access to /dev/null. + { + Type: devices.CharDevice, + Major: 1, + Minor: 3, + Permissions: "rwm", + Allow: true, + }, + }, + }, + } + + // Create a "container" within the "pod" cgroup. + // This is not a real container, just a process in the cgroup. + cmd := exec.Command("bash", "-c", "while true; do echo > /dev/null; done") + cmd.Env = append(os.Environ(), "LANG=C") + var stderr bytes.Buffer + cmd.Stderr = &stderr + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + + // Put the process into a cgroup. + cm := newManager(t, containerConfig) + if err := cm.Apply(cmd.Process.Pid); err != nil { + t.Fatal(err) + } + // Check that we put the "container" into the "pod" cgroup. + if !strings.HasPrefix(cm.Path("devices"), pm.Path("devices")) { + t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", + cm.Path("devices"), pm.Path("devices")) + } + if err := cm.Set(containerConfig.Resources); err != nil { + t.Fatal(err) + } + + // Now update the pod a few times. + for i := 0; i < 42; i++ { + podConfig.Resources.PidsLimit++ + podConfig.Resources.Memory += 1024 * 1024 + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + } + // Kill the "container". + if err := cmd.Process.Kill(); err != nil { + t.Fatal(err) + } + + _ = cmd.Wait() + + // "Container" stderr should be empty. + if stderr.Len() != 0 { + t.Fatalf("container stderr not empty: %s", stderr.String()) + } +} + +func testSkipDevices(t *testing.T, skipDevices bool, expected []string) { + if !systemd.IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + podConfig := &configs.Cgroup{ + Parent: "system.slice", + Name: "system-runc_test_pods.slice", + Resources: &configs.Resources{ + SkipDevices: skipDevices, + }, + } + // Create "pods" cgroup (a systemd slice to hold containers). + pm := newManager(t, podConfig) + if err := pm.Apply(-1); err != nil { + t.Fatal(err) + } + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + + config := &configs.Cgroup{ + Parent: "system-runc_test_pods.slice", + ScopePrefix: "test", + Name: "SkipDevices", + Resources: &configs.Resources{ + Devices: []*devices.Rule{ + // Allow access to /dev/full only. + { + Type: devices.CharDevice, + Major: 1, + Minor: 7, + Permissions: "rwm", + Allow: true, + }, + }, + }, + } + + // Create a "container" within the "pods" cgroup. + // This is not a real container, just a process in the cgroup. + cmd := exec.Command("bash", "-c", "read; echo > /dev/full; cat /dev/null; true") + cmd.Env = append(os.Environ(), "LANG=C") + stdinR, stdinW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + cmd.Stdin = stdinR + var stderr bytes.Buffer + cmd.Stderr = &stderr + err = cmd.Start() + stdinR.Close() + defer stdinW.Close() + if err != nil { + t.Fatal(err) + } + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _, _ = stdinW.WriteString("hey\n") + _ = cmd.Wait() + }() + + // Put the process into a cgroup. + m := newManager(t, config) + if err := m.Apply(cmd.Process.Pid); err != nil { + t.Fatal(err) + } + // Check that we put the "container" into the "pod" cgroup. + if !strings.HasPrefix(m.Path("devices"), pm.Path("devices")) { + t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", + m.Path("devices"), pm.Path("devices")) + } + if err := m.Set(config.Resources); err != nil { + // failed to write "c 1:7 rwm": write /sys/fs/cgroup/devices/system.slice/system-runc_test_pods.slice/test-SkipDevices.scope/devices.allow: operation not permitted + if skipDevices == false && strings.HasSuffix(err.Error(), "/devices.allow: operation not permitted") { + // Cgroup v1 devices controller gives EPERM on trying + // to enable devices that are not enabled + // (skipDevices=false) in a parent cgroup. + // If this happens, test is passing. + return + } + t.Fatal(err) + } + + // Check that we can access /dev/full but not /dev/zero. + if _, err := stdinW.WriteString("wow\n"); err != nil { + t.Fatal(err) + } + if err := cmd.Wait(); err != nil { + t.Fatal(err) + } + for _, exp := range expected { + if !strings.Contains(stderr.String(), exp) { + t.Errorf("expected %q, got: %s", exp, stderr.String()) + } + } +} + +func TestSkipDevicesTrue(t *testing.T) { + testSkipDevices(t, true, []string{ + "echo: write error: No space left on device", + "cat: /dev/null: Operation not permitted", + }) +} + +func TestSkipDevicesFalse(t *testing.T) { + // If SkipDevices is not set for the parent slice, access to both + // devices should fail. This is done to assess the test correctness. + // For cgroup v1, we check for m.Set returning EPERM. + // For cgroup v2, we check for the errors below. + testSkipDevices(t, false, []string{ + "/dev/full: Operation not permitted", + "cat: /dev/null: Operation not permitted", + }) +} + +func newManager(t *testing.T, config *configs.Cgroup) (m cgroups.Manager) { + t.Helper() + var err error + + if cgroups.IsCgroup2UnifiedMode() { + m, err = systemd.NewUnifiedManager(config, "") + } else { + m, err = systemd.NewLegacyManager(config, nil) + } + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = m.Destroy() }) + + return m +} diff --git a/libcontainer/cgroups/devices/v1.go b/libcontainer/cgroups/devices/v1.go new file mode 100644 index 00000000000..64eadd12f11 --- /dev/null +++ b/libcontainer/cgroups/devices/v1.go @@ -0,0 +1,84 @@ +package devices + +import ( + "bytes" + "errors" + "reflect" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runc/libcontainer/userns" +) + +var testingSkipFinalCheck bool + +func setV1(path string, r *configs.Resources) error { + if userns.RunningInUserNS() || r.SkipDevices { + return nil + } + // Generate two emulators, one for the current state of the cgroup and one + // for the requested state by the user. + current, err := loadEmulator(path) + if err != nil { + return err + } + target, err := buildEmulator(r.Devices) + if err != nil { + return err + } + + // Compute the minimal set of transition rules needed to achieve the + // requested state. + transitionRules, err := current.Transition(target) + if err != nil { + return err + } + for _, rule := range transitionRules { + file := "devices.deny" + if rule.Allow { + file = "devices.allow" + } + if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil { + return err + } + } + + // Final safety check -- ensure that the resulting state is what was + // requested. This is only really correct for white-lists, but for + // black-lists we can at least check that the cgroup is in the right mode. + // + // This safety-check is skipped for the unit tests because we cannot + // currently mock devices.list correctly. + if !testingSkipFinalCheck { + currentAfter, err := loadEmulator(path) + if err != nil { + return err + } + if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) { + return errors.New("resulting devices cgroup doesn't precisely match target") + } else if target.IsBlacklist() != currentAfter.IsBlacklist() { + return errors.New("resulting devices cgroup doesn't match target mode") + } + } + return nil +} + +func loadEmulator(path string) (*Emulator, error) { + list, err := cgroups.ReadFile(path, "devices.list") + if err != nil { + return nil, err + } + return EmulatorFromList(bytes.NewBufferString(list)) +} + +func buildEmulator(rules []*devices.Rule) (*Emulator, error) { + // This defaults to a white-list -- which is what we want! + emu := &Emulator{} + for _, rule := range rules { + if err := emu.Apply(*rule); err != nil { + return nil, err + } + } + return emu, nil +} diff --git a/libcontainer/cgroups/fs/devices_test.go b/libcontainer/cgroups/devices/v1_test.go similarity index 60% rename from libcontainer/cgroups/fs/devices_test.go rename to libcontainer/cgroups/devices/v1_test.go index bdd1967836e..1d1c4c3770a 100644 --- a/libcontainer/cgroups/fs/devices_test.go +++ b/libcontainer/cgroups/devices/v1_test.go @@ -1,21 +1,34 @@ -package fs +package devices import ( + "os" + "path" "testing" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/devices" ) -func TestDevicesSetAllow(t *testing.T) { - path := tempDir(t, "devices") +func init() { + testingSkipFinalCheck = true + cgroups.TestMode = true +} + +func TestSetV1Allow(t *testing.T) { + dir := t.TempDir() - writeFileContents(t, path, map[string]string{ + for file, contents := range map[string]string{ "devices.allow": "", "devices.deny": "", "devices.list": "a *:* rwm", - }) + } { + err := os.WriteFile(path.Join(dir, file), []byte(contents), 0o600) + if err != nil { + t.Fatal(err) + } + } r := &configs.Resources{ Devices: []*devices.Rule{ @@ -29,13 +42,12 @@ func TestDevicesSetAllow(t *testing.T) { }, } - d := &DevicesGroup{TestingSkipFinalCheck: true} - if err := d.Set(path, r); err != nil { + if err := setV1(dir, r); err != nil { t.Fatal(err) } // The default deny rule must be written. - value, err := fscommon.GetCgroupParamString(path, "devices.deny") + value, err := fscommon.GetCgroupParamString(dir, "devices.deny") if err != nil { t.Fatal(err) } @@ -44,7 +56,7 @@ func TestDevicesSetAllow(t *testing.T) { } // Permitted rule must be written. - if value, err := fscommon.GetCgroupParamString(path, "devices.allow"); err != nil { + if value, err := fscommon.GetCgroupParamString(dir, "devices.allow"); err != nil { t.Fatal(err) } else if value != "c 1:5 rwm" { t.Errorf("Got the wrong value (%q), set devices.allow failed.", value) diff --git a/libcontainer/cgroups/fs2/devices.go b/libcontainer/cgroups/devices/v2.go similarity index 83% rename from libcontainer/cgroups/fs2/devices.go rename to libcontainer/cgroups/devices/v2.go index 0d23456072c..a1b3f3a5a5e 100644 --- a/libcontainer/cgroups/fs2/devices.go +++ b/libcontainer/cgroups/devices/v2.go @@ -1,12 +1,10 @@ -package fs2 +package devices import ( "fmt" "golang.org/x/sys/unix" - "github.com/opencontainers/runc/libcontainer/cgroups/ebpf" - "github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runc/libcontainer/userns" @@ -53,11 +51,11 @@ func canSkipEBPFError(r *configs.Resources) bool { return true } -func setDevices(dirPath string, r *configs.Resources) error { +func setV2(dirPath string, r *configs.Resources) error { if r.SkipDevices { return nil } - insts, license, err := devicefilter.DeviceFilter(r.Devices) + insts, license, err := DeviceFilter(r.Devices) if err != nil { return err } @@ -66,7 +64,7 @@ func setDevices(dirPath string, r *configs.Resources) error { return fmt.Errorf("cannot get dir FD for %s", dirPath) } defer unix.Close(dirFD) - if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { + if _, err := LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { if !canSkipEBPFError(r) { return err } diff --git a/libcontainer/cgroups/fs/devices.go b/libcontainer/cgroups/fs/devices.go index 4527a70ebfc..0bf3d9debb9 100644 --- a/libcontainer/cgroups/fs/devices.go +++ b/libcontainer/cgroups/fs/devices.go @@ -1,20 +1,11 @@ package fs import ( - "bytes" - "errors" - "reflect" - "github.com/opencontainers/runc/libcontainer/cgroups" - cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/devices" - "github.com/opencontainers/runc/libcontainer/userns" ) -type DevicesGroup struct { - TestingSkipFinalCheck bool -} +type DevicesGroup struct{} func (s *DevicesGroup) Name() string { return "devices" @@ -33,75 +24,14 @@ func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error { return apply(path, pid) } -func loadEmulator(path string) (*cgroupdevices.Emulator, error) { - list, err := cgroups.ReadFile(path, "devices.list") - if err != nil { - return nil, err - } - return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list)) -} - -func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) { - // This defaults to a white-list -- which is what we want! - emu := &cgroupdevices.Emulator{} - for _, rule := range rules { - if err := emu.Apply(*rule); err != nil { - return nil, err - } - } - return emu, nil -} - func (s *DevicesGroup) Set(path string, r *configs.Resources) error { - if userns.RunningInUserNS() || r.SkipDevices { - return nil - } - - // Generate two emulators, one for the current state of the cgroup and one - // for the requested state by the user. - current, err := loadEmulator(path) - if err != nil { - return err - } - target, err := buildEmulator(r.Devices) - if err != nil { - return err - } - - // Compute the minimal set of transition rules needed to achieve the - // requested state. - transitionRules, err := current.Transition(target) - if err != nil { - return err - } - for _, rule := range transitionRules { - file := "devices.deny" - if rule.Allow { - file = "devices.allow" - } - if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil { - return err + if cgroups.DevicesSetV1 == nil { + if len(r.Devices) == 0 { + return nil } + return cgroups.ErrDevicesUnsupported } - - // Final safety check -- ensure that the resulting state is what was - // requested. This is only really correct for white-lists, but for - // black-lists we can at least check that the cgroup is in the right mode. - // - // This safety-check is skipped for the unit tests because we cannot - // currently mock devices.list correctly. - if !s.TestingSkipFinalCheck { - currentAfter, err := loadEmulator(path) - if err != nil { - return err - } - if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) { - return errors.New("resulting devices cgroup doesn't precisely match target") - } else if target.IsBlacklist() != currentAfter.IsBlacklist() { - return errors.New("resulting devices cgroup doesn't match target mode") - } - } - return nil + return cgroups.DevicesSetV1(path, r) } func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error { diff --git a/libcontainer/cgroups/fs/fs.go b/libcontainer/cgroups/fs/fs.go index fb4fcc7f75b..be4dcc341bb 100644 --- a/libcontainer/cgroups/fs/fs.go +++ b/libcontainer/cgroups/fs/fs.go @@ -182,7 +182,7 @@ func (m *manager) Set(r *configs.Resources) error { if err := sys.Set(path, r); err != nil { // When rootless is true, errors from the device subsystem // are ignored, as it is really not expected to work. - if m.cgroups.Rootless && sys.Name() == "devices" { + if m.cgroups.Rootless && sys.Name() == "devices" && !errors.Is(err, cgroups.ErrDevicesUnsupported) { continue } // However, errors from other subsystems are not ignored. diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go index 492778e3105..d5208d7782f 100644 --- a/libcontainer/cgroups/fs2/fs2.go +++ b/libcontainer/cgroups/fs2/fs2.go @@ -175,8 +175,10 @@ func (m *manager) Set(r *configs.Resources) error { // When rootless is true, errors from the device subsystem are ignored because it is really not expected to work. // However, errors from other subsystems are not ignored. // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" - if err := setDevices(m.dirPath, r); err != nil && !m.config.Rootless { - return err + if err := setDevices(m.dirPath, r); err != nil { + if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) { + return err + } } // cpuset (since kernel 5.0) if err := setCpuset(m.dirPath, r); err != nil { @@ -201,6 +203,16 @@ func (m *manager) Set(r *configs.Resources) error { return nil } +func setDevices(dirPath string, r *configs.Resources) error { + if cgroups.DevicesSetV2 == nil { + if len(r.Devices) > 0 { + return cgroups.ErrDevicesUnsupported + } + return nil + } + return cgroups.DevicesSetV2(dirPath, r) +} + func (m *manager) setUnified(res map[string]string) error { for k, v := range res { if strings.Contains(k, "/") { diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index ddb9c44dc10..0c3562e6bfe 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -16,6 +16,7 @@ import ( dbus "github.com/godbus/dbus/v5" "github.com/sirupsen/logrus" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" ) @@ -32,6 +33,8 @@ var ( isRunningSystemdOnce sync.Once isRunningSystemd bool + + GenerateDeviceProps func(*configs.Resources) ([]systemdDbus.Property, error) ) // NOTE: This function comes from package github.com/coreos/go-systemd/util @@ -313,3 +316,16 @@ func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems st } return nil } + +// generateDeviceProperties takes the configured device rules and generates a +// corresponding set of systemd properties to configure the devices correctly. +func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, error) { + if GenerateDeviceProps == nil { + if len(r.Devices) > 0 { + return nil, cgroups.ErrDevicesUnsupported + } + return nil, nil + } + + return GenerateDeviceProps(r) +} diff --git a/libcontainer/cgroups/systemd/devices.go b/libcontainer/cgroups/systemd/devices.go index 221c978942c..edd1f158568 100644 --- a/libcontainer/cgroups/systemd/devices.go +++ b/libcontainer/cgroups/systemd/devices.go @@ -1,20 +1,11 @@ package systemd import ( - "bufio" - "errors" - "fmt" - "os" "reflect" - "strings" - systemdDbus "github.com/coreos/go-systemd/v22/dbus" dbus "github.com/godbus/dbus/v5" - "github.com/sirupsen/logrus" - cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/devices" ) // freezeBeforeSet answers whether there is a need to freeze the cgroup before @@ -81,213 +72,3 @@ func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) ( } return } - -func groupPrefix(ruleType devices.Type) (string, error) { - switch ruleType { - case devices.BlockDevice: - return "block-", nil - case devices.CharDevice: - return "char-", nil - default: - return "", fmt.Errorf("device type %v has no group prefix", ruleType) - } -} - -// findDeviceGroup tries to find the device group name (as listed in -// /proc/devices) with the type prefixed as required for DeviceAllow, for a -// given (type, major) combination. If more than one device group exists, an -// arbitrary one is chosen. -func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) { - fh, err := os.Open("/proc/devices") - if err != nil { - return "", err - } - defer fh.Close() - - prefix, err := groupPrefix(ruleType) - if err != nil { - return "", err - } - - scanner := bufio.NewScanner(fh) - var currentType devices.Type - for scanner.Scan() { - // We need to strip spaces because the first number is column-aligned. - line := strings.TrimSpace(scanner.Text()) - - // Handle the "header" lines. - switch line { - case "Block devices:": - currentType = devices.BlockDevice - continue - case "Character devices:": - currentType = devices.CharDevice - continue - case "": - continue - } - - // Skip lines unrelated to our type. - if currentType != ruleType { - continue - } - - // Parse out the (major, name). - var ( - currMajor int64 - currName string - ) - if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 { - if err == nil { - err = errors.New("wrong number of fields") - } - return "", fmt.Errorf("scan /proc/devices line %q: %w", line, err) - } - - if currMajor == ruleMajor { - return prefix + currName, nil - } - } - if err := scanner.Err(); err != nil { - return "", fmt.Errorf("reading /proc/devices: %w", err) - } - // Couldn't find the device group. - return "", nil -} - -// DeviceAllow is the dbus type "a(ss)" which means we need a struct -// to represent it in Go. -type deviceAllowEntry struct { - Path string - Perms string -} - -func allowAllDevices() []systemdDbus.Property { - // Setting mode to auto and removing all DeviceAllow rules - // results in allowing access to all devices. - return []systemdDbus.Property{ - newProp("DevicePolicy", "auto"), - newProp("DeviceAllow", []deviceAllowEntry{}), - } -} - -// generateDeviceProperties takes the configured device rules and generates a -// corresponding set of systemd properties to configure the devices correctly. -func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, error) { - if r.SkipDevices { - return nil, nil - } - - properties := []systemdDbus.Property{ - // Always run in the strictest white-list mode. - newProp("DevicePolicy", "strict"), - // Empty the DeviceAllow array before filling it. - newProp("DeviceAllow", []deviceAllowEntry{}), - } - - // Figure out the set of rules. - configEmu := &cgroupdevices.Emulator{} - for _, rule := range r.Devices { - if err := configEmu.Apply(*rule); err != nil { - return nil, fmt.Errorf("unable to apply rule for systemd: %w", err) - } - } - // systemd doesn't support blacklists. So we log a warning, and tell - // systemd to act as a deny-all whitelist. This ruleset will be replaced - // with our normal fallback code. This may result in spurious errors, but - // the only other option is to error out here. - if configEmu.IsBlacklist() { - // However, if we're dealing with an allow-all rule then we can do it. - if configEmu.IsAllowAll() { - return allowAllDevices(), nil - } - logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule") - return properties, nil - } - - // Now generate the set of rules we actually need to apply. Unlike the - // normal devices cgroup, in "strict" mode systemd defaults to a deny-all - // whitelist which is the default for devices.Emulator. - finalRules, err := configEmu.Rules() - if err != nil { - return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err) - } - var deviceAllowList []deviceAllowEntry - for _, rule := range finalRules { - if !rule.Allow { - // Should never happen. - return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule) - } - switch rule.Type { - case devices.BlockDevice, devices.CharDevice: - default: - // Should never happen. - return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type) - } - - entry := deviceAllowEntry{ - Perms: string(rule.Permissions), - } - - // systemd has a fairly odd (though understandable) syntax here, and - // because of the OCI configuration format we have to do quite a bit of - // trickery to convert things: - // - // * Concrete rules with non-wildcard major/minor numbers have to use - // /dev/{block,char} paths. This is slightly odd because it means - // that we cannot add whitelist rules for devices that don't exist, - // but there's not too much we can do about that. - // - // However, path globbing is not support for path-based rules so we - // need to handle wildcards in some other manner. - // - // * Wildcard-minor rules have to specify a "device group name" (the - // second column in /proc/devices). - // - // * Wildcard (major and minor) rules can just specify a glob with the - // type ("char-*" or "block-*"). - // - // The only type of rule we can't handle is wildcard-major rules, and - // so we'll give a warning in that case (note that the fallback code - // will insert any rules systemd couldn't handle). What amazing fun. - - if rule.Major == devices.Wildcard { - // "_ *:n _" rules aren't supported by systemd. - if rule.Minor != devices.Wildcard { - logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule) - continue - } - - // "_ *:* _" rules just wildcard everything. - prefix, err := groupPrefix(rule.Type) - if err != nil { - return nil, err - } - entry.Path = prefix + "*" - } else if rule.Minor == devices.Wildcard { - // "_ n:* _" rules require a device group from /proc/devices. - group, err := findDeviceGroup(rule.Type, rule.Major) - if err != nil { - return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err) - } - if group == "" { - // Couldn't find a group. - logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule) - continue - } - entry.Path = group - } else { - // "_ n:m _" rules are just a path in /dev/{block,char}/. - switch rule.Type { - case devices.BlockDevice: - entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor) - case devices.CharDevice: - entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor) - } - } - deviceAllowList = append(deviceAllowList, entry) - } - - properties = append(properties, newProp("DeviceAllow", deviceAllowList)) - return properties, nil -} diff --git a/libcontainer/cgroups/systemd/devices_test.go b/libcontainer/cgroups/systemd/freeze_test.go similarity index 57% rename from libcontainer/cgroups/systemd/devices_test.go rename to libcontainer/cgroups/systemd/freeze_test.go index 74ce2572b1e..76057865433 100644 --- a/libcontainer/cgroups/systemd/devices_test.go +++ b/libcontainer/cgroups/systemd/freeze_test.go @@ -8,235 +8,11 @@ import ( "strings" "testing" - "golang.org/x/sys/unix" - "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/devices" + "golang.org/x/sys/unix" ) -// TestPodSkipDevicesUpdate checks that updating a pod having SkipDevices: true -// does not result in spurious "permission denied" errors in a container -// running under the pod. The test is somewhat similar in nature to the -// @test "update devices [minimal transition rules]" in tests/integration, -// but uses a pod. -func TestPodSkipDevicesUpdate(t *testing.T) { - if !IsRunningSystemd() { - t.Skip("Test requires systemd.") - } - if os.Geteuid() != 0 { - t.Skip("Test requires root.") - } - - podName := "system-runc_test_pod" + t.Name() + ".slice" - podConfig := &configs.Cgroup{ - Systemd: true, - Parent: "system.slice", - Name: podName, - Resources: &configs.Resources{ - PidsLimit: 42, - Memory: 32 * 1024 * 1024, - SkipDevices: true, - }, - } - // Create "pod" cgroup (a systemd slice to hold containers). - pm := newManager(t, podConfig) - if err := pm.Apply(-1); err != nil { - t.Fatal(err) - } - if err := pm.Set(podConfig.Resources); err != nil { - t.Fatal(err) - } - - containerConfig := &configs.Cgroup{ - Parent: podName, - ScopePrefix: "test", - Name: "PodSkipDevicesUpdate", - Resources: &configs.Resources{ - Devices: []*devices.Rule{ - // Allow access to /dev/null. - { - Type: devices.CharDevice, - Major: 1, - Minor: 3, - Permissions: "rwm", - Allow: true, - }, - }, - }, - } - - // Create a "container" within the "pod" cgroup. - // This is not a real container, just a process in the cgroup. - cmd := exec.Command("bash", "-c", "while true; do echo > /dev/null; done") - cmd.Env = append(os.Environ(), "LANG=C") - var stderr bytes.Buffer - cmd.Stderr = &stderr - if err := cmd.Start(); err != nil { - t.Fatal(err) - } - // Make sure to not leave a zombie. - defer func() { - // These may fail, we don't care. - _ = cmd.Process.Kill() - _ = cmd.Wait() - }() - - // Put the process into a cgroup. - cm := newManager(t, containerConfig) - if err := cm.Apply(cmd.Process.Pid); err != nil { - t.Fatal(err) - } - // Check that we put the "container" into the "pod" cgroup. - if !strings.HasPrefix(cm.Path("devices"), pm.Path("devices")) { - t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", - cm.Path("devices"), pm.Path("devices")) - } - if err := cm.Set(containerConfig.Resources); err != nil { - t.Fatal(err) - } - - // Now update the pod a few times. - for i := 0; i < 42; i++ { - podConfig.Resources.PidsLimit++ - podConfig.Resources.Memory += 1024 * 1024 - if err := pm.Set(podConfig.Resources); err != nil { - t.Fatal(err) - } - } - // Kill the "container". - if err := cmd.Process.Kill(); err != nil { - t.Fatal(err) - } - - _ = cmd.Wait() - - // "Container" stderr should be empty. - if stderr.Len() != 0 { - t.Fatalf("container stderr not empty: %s", stderr.String()) - } -} - -func testSkipDevices(t *testing.T, skipDevices bool, expected []string) { - if !IsRunningSystemd() { - t.Skip("Test requires systemd.") - } - if os.Geteuid() != 0 { - t.Skip("Test requires root.") - } - - podConfig := &configs.Cgroup{ - Parent: "system.slice", - Name: "system-runc_test_pods.slice", - Resources: &configs.Resources{ - SkipDevices: skipDevices, - }, - } - // Create "pods" cgroup (a systemd slice to hold containers). - pm := newManager(t, podConfig) - if err := pm.Apply(-1); err != nil { - t.Fatal(err) - } - if err := pm.Set(podConfig.Resources); err != nil { - t.Fatal(err) - } - - config := &configs.Cgroup{ - Parent: "system-runc_test_pods.slice", - ScopePrefix: "test", - Name: "SkipDevices", - Resources: &configs.Resources{ - Devices: []*devices.Rule{ - // Allow access to /dev/full only. - { - Type: devices.CharDevice, - Major: 1, - Minor: 7, - Permissions: "rwm", - Allow: true, - }, - }, - }, - } - - // Create a "container" within the "pods" cgroup. - // This is not a real container, just a process in the cgroup. - cmd := exec.Command("bash", "-c", "read; echo > /dev/full; cat /dev/null; true") - cmd.Env = append(os.Environ(), "LANG=C") - stdinR, stdinW, err := os.Pipe() - if err != nil { - t.Fatal(err) - } - cmd.Stdin = stdinR - var stderr bytes.Buffer - cmd.Stderr = &stderr - err = cmd.Start() - stdinR.Close() - defer stdinW.Close() - if err != nil { - t.Fatal(err) - } - // Make sure to not leave a zombie. - defer func() { - // These may fail, we don't care. - _, _ = stdinW.WriteString("hey\n") - _ = cmd.Wait() - }() - - // Put the process into a cgroup. - m := newManager(t, config) - if err := m.Apply(cmd.Process.Pid); err != nil { - t.Fatal(err) - } - // Check that we put the "container" into the "pod" cgroup. - if !strings.HasPrefix(m.Path("devices"), pm.Path("devices")) { - t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", - m.Path("devices"), pm.Path("devices")) - } - if err := m.Set(config.Resources); err != nil { - // failed to write "c 1:7 rwm": write /sys/fs/cgroup/devices/system.slice/system-runc_test_pods.slice/test-SkipDevices.scope/devices.allow: operation not permitted - if skipDevices == false && strings.HasSuffix(err.Error(), "/devices.allow: operation not permitted") { - // Cgroup v1 devices controller gives EPERM on trying - // to enable devices that are not enabled - // (skipDevices=false) in a parent cgroup. - // If this happens, test is passing. - return - } - t.Fatal(err) - } - - // Check that we can access /dev/full but not /dev/zero. - if _, err := stdinW.WriteString("wow\n"); err != nil { - t.Fatal(err) - } - if err := cmd.Wait(); err != nil { - t.Fatal(err) - } - for _, exp := range expected { - if !strings.Contains(stderr.String(), exp) { - t.Errorf("expected %q, got: %s", exp, stderr.String()) - } - } -} - -func TestSkipDevicesTrue(t *testing.T) { - testSkipDevices(t, true, []string{ - "echo: write error: No space left on device", - "cat: /dev/null: Operation not permitted", - }) -} - -func TestSkipDevicesFalse(t *testing.T) { - // If SkipDevices is not set for the parent slice, access to both - // devices should fail. This is done to assess the test correctness. - // For cgroup v1, we check for m.Set returning EPERM. - // For cgroup v2, we check for the errors below. - testSkipDevices(t, false, []string{ - "/dev/full: Operation not permitted", - "cat: /dev/null: Operation not permitted", - }) -} - func TestFreezeBeforeSet(t *testing.T) { requireV1(t) diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index 10ca0eeea7e..17b05a55298 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -12,6 +12,8 @@ import ( securejoin "github.com/cyphar/filepath-securejoin" "golang.org/x/sys/unix" + //nolint:revive // Enable cgroup manager to manage devices + _ "github.com/opencontainers/runc/libcontainer/cgroups/devices" "github.com/opencontainers/runc/libcontainer/cgroups/manager" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs/validate" diff --git a/libcontainer/integration/init_test.go b/libcontainer/integration/init_test.go index b6a3714d493..76638ffc7fa 100644 --- a/libcontainer/integration/init_test.go +++ b/libcontainer/integration/init_test.go @@ -6,6 +6,8 @@ import ( "testing" "github.com/opencontainers/runc/libcontainer" + //nolint:revive // Enable cgroup manager to manage devices + _ "github.com/opencontainers/runc/libcontainer/cgroups/devices" _ "github.com/opencontainers/runc/libcontainer/nsenter" "github.com/sirupsen/logrus"