diff --git a/libcontainer/cgroups/fs/fs.go b/libcontainer/cgroups/fs/fs.go index 6296719b733..c6d92bef4a0 100644 --- a/libcontainer/cgroups/fs/fs.go +++ b/libcontainer/cgroups/fs/fs.go @@ -31,6 +31,7 @@ var ( &NetPrioGroup{}, &PerfEventGroup{}, &FreezerGroup{}, + &RdmaGroup{}, &NameGroup{GroupName: "name=systemd", Join: true}, } HugePageSizes, _ = cgroups.GetHugePageSize() diff --git a/libcontainer/cgroups/fs/rdma.go b/libcontainer/cgroups/fs/rdma.go new file mode 100644 index 00000000000..5fb25abe766 --- /dev/null +++ b/libcontainer/cgroups/fs/rdma.go @@ -0,0 +1,25 @@ +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type RdmaGroup struct{} + +func (s *RdmaGroup) Name() string { + return "rdma" +} + +func (s *RdmaGroup) Apply(path string, d *cgroupData) error { + return join(path, d.pid) +} + +func (s *RdmaGroup) Set(path string, r *configs.Resources) error { + return fscommon.RdmaSet(path, r) +} + +func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error { + return fscommon.RdmaGetStats(path, stats) +} diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go index 6cd9f51f65d..84e4ba89539 100644 --- a/libcontainer/cgroups/fs2/fs2.go +++ b/libcontainer/cgroups/fs2/fs2.go @@ -125,6 +125,10 @@ func (m *manager) GetStats() (*cgroups.Stats, error) { if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) { errs = append(errs, err) } + // rdma (since kernel 4.11) + if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } if len(errs) > 0 && !m.rootless { return st, fmt.Errorf("error while statting cgroup v2: %+v", errs) } @@ -183,6 +187,10 @@ func (m *manager) Set(r *configs.Resources) error { if err := setHugeTlb(m.dirPath, r); err != nil { return err } + // rdma (since kernel 4.11) + if err := fscommon.RdmaSet(m.dirPath, r); err != nil { + return err + } // freezer (since kernel 5.2, pseudo-controller) if err := setFreezer(m.dirPath, r.Freezer); err != nil { return err diff --git a/libcontainer/cgroups/fscommon/rdma.go b/libcontainer/cgroups/fscommon/rdma.go new file mode 100644 index 00000000000..d463d15ee4e --- /dev/null +++ b/libcontainer/cgroups/fscommon/rdma.go @@ -0,0 +1,121 @@ +package fscommon + +import ( + "bufio" + "errors" + "math" + "os" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +// parseRdmaKV parses raw string to RdmaEntry. +func parseRdmaKV(raw string, entry *cgroups.RdmaEntry) error { + var value uint32 + + parts := strings.SplitN(raw, "=", 3) + + if len(parts) != 2 { + return errors.New("Unable to parse RDMA entry") + } + + k, v := parts[0], parts[1] + + if v == "max" { + value = math.MaxUint32 + } else { + val64, err := strconv.ParseUint(v, 10, 32) + if err != nil { + return err + } + value = uint32(val64) + } + if k == "hca_handle" { + entry.HcaHandles = value + } else if k == "hca_object" { + entry.HcaObjects = value + } + + return nil +} + +// readRdmaEntries reads and converts array of rawstrings to RdmaEntries from file. +// example entry: mlx4_0 hca_handle=2 hca_object=2000 +func readRdmaEntries(dir, file string) ([]cgroups.RdmaEntry, error) { + rdmaEntries := make([]cgroups.RdmaEntry, 0) + fd, err := cgroups.OpenFile(dir, file, unix.O_RDONLY) + if err != nil { + return nil, err + } + defer fd.Close() //nolint:errorlint + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + parts := strings.SplitN(scanner.Text(), " ", 4) + if len(parts) == 3 { + entry := new(cgroups.RdmaEntry) + entry.Device = parts[0] + err = parseRdmaKV(parts[1], entry) + if err != nil { + continue + } + err = parseRdmaKV(parts[2], entry) + if err != nil { + continue + } + + rdmaEntries = append(rdmaEntries, *entry) + } + } + return rdmaEntries, scanner.Err() +} + +// RdmaGetStats returns rdma stats such as totalLimit and current entries. +func RdmaGetStats(path string, stats *cgroups.Stats) error { + currentEntries, err := readRdmaEntries(path, "rdma.current") + if err != nil { + if errors.Is(err, os.ErrNotExist) { + err = nil + } + return err + } + maxEntries, err := readRdmaEntries(path, "rdma.max") + if err != nil { + return err + } + // If device got removed between reading two files, ignore returning stats. + if len(currentEntries) != len(maxEntries) { + return nil + } + + stats.RdmaStats = cgroups.RdmaStats{ + RdmaLimit: maxEntries, + RdmaCurrent: currentEntries, + } + + return nil +} + +func createCmdString(device string, limits configs.LinuxRdma) string { + cmdString := device + if limits.HcaHandles != nil { + cmdString += " hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10) + } + if limits.HcaObjects != nil { + cmdString += " hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10) + } + return cmdString +} + +// RdmaSet sets RDMA resources. +func RdmaSet(path string, r *configs.Resources) error { + for device, limits := range r.Rdma { + if err := cgroups.WriteFile(path, "rdma.max", createCmdString(device, limits)); err != nil { + return err + } + } + return nil +} diff --git a/libcontainer/cgroups/fscommon/rdma_test.go b/libcontainer/cgroups/fscommon/rdma_test.go new file mode 100644 index 00000000000..9bc08190d12 --- /dev/null +++ b/libcontainer/cgroups/fscommon/rdma_test.go @@ -0,0 +1,57 @@ +package fscommon + +import ( + "os" + "path/filepath" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +/* Roadmap for future */ +// (Low-priority) TODO: Check if it is possible to virtually mimic an actual RDMA device. +// TODO: Think of more edge-cases to add. + +// TestRdmaSet performs an E2E test of RdmaSet(), parseRdmaKV() using dummy device and a dummy cgroup file-system. +// Note: Following test does not guarantees that your host supports RDMA since this mocks underlying infrastructure. +func TestRdmaSet(t *testing.T) { + testCgroupPath := filepath.Join(t.TempDir(), "rdma") + + // Ensure the full mock cgroup path exists. + err := os.Mkdir(testCgroupPath, 0o755) + if err != nil { + t.Fatal(err) + } + + rdmaDevice := "mlx5_1" + maxHandles := uint32(100) + maxObjects := uint32(300) + + rdmaStubResource := &configs.Resources{ + Rdma: map[string]configs.LinuxRdma{ + rdmaDevice: { + HcaHandles: &maxHandles, + HcaObjects: &maxObjects, + }, + }, + } + + if err := RdmaSet(testCgroupPath, rdmaStubResource); err != nil { + t.Fatal(err) + } + + // The default rdma.max must be written. + rdmaEntries, err := readRdmaEntries(testCgroupPath, "rdma.max") + if err != nil { + t.Fatal(err) + } + if len(rdmaEntries) != 1 { + t.Fatal("rdma_test: Got the wrong values while parsing entries from rdma.max") + } + if rdmaEntries[0].HcaHandles != maxHandles { + t.Fatalf("rdma_test: Got the wrong value for hca_handles") + } + if rdmaEntries[0].HcaObjects != maxObjects { + t.Fatalf("rdma_test: Got the wrong value for hca_Objects") + } +} diff --git a/libcontainer/cgroups/stats.go b/libcontainer/cgroups/stats.go index e7f9c462635..8f7301da0d4 100644 --- a/libcontainer/cgroups/stats.go +++ b/libcontainer/cgroups/stats.go @@ -146,6 +146,17 @@ type HugetlbStats struct { Failcnt uint64 `json:"failcnt"` } +type RdmaEntry struct { + Device string `json:"device,omitempty"` + HcaHandles uint32 `json:"hca_handles,omitempty"` + HcaObjects uint32 `json:"hca_objects,omitempty"` +} + +type RdmaStats struct { + RdmaLimit []RdmaEntry `json:"rdma_limit,omitempty"` + RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"` +} + type Stats struct { CpuStats CpuStats `json:"cpu_stats,omitempty"` CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"` @@ -154,6 +165,7 @@ type Stats struct { BlkioStats BlkioStats `json:"blkio_stats,omitempty"` // the map is in the format "size of hugepage: stats of the hugepage" HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` + RdmaStats RdmaStats `json:"rdma_stats,omitempty"` } func NewStats() *Stats { diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index 1a8e1e3c6c1..a0a68c78a80 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -58,6 +58,7 @@ var legacySubsystems = []subsystem{ &fs.NetPrioGroup{}, &fs.NetClsGroup{}, &fs.NameGroup{GroupName: "name=systemd"}, + &fs.RdmaGroup{}, } func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { diff --git a/libcontainer/configs/cgroup_linux.go b/libcontainer/configs/cgroup_linux.go index a1e7f0afd44..b08a3715a30 100644 --- a/libcontainer/configs/cgroup_linux.go +++ b/libcontainer/configs/cgroup_linux.go @@ -117,6 +117,9 @@ type Resources struct { // Set class identifier for container's network packets NetClsClassid uint32 `json:"net_cls_classid_u"` + // Rdma resource restriction configuration + Rdma map[string]LinuxRdma `json:"rdma"` + // Used on cgroups v2: // CpuWeight sets a proportional bandwidth limit. diff --git a/libcontainer/configs/rdma.go b/libcontainer/configs/rdma.go new file mode 100644 index 00000000000..c69f2c8024b --- /dev/null +++ b/libcontainer/configs/rdma.go @@ -0,0 +1,9 @@ +package configs + +// LinuxRdma for Linux cgroup 'rdma' resource management (Linux 4.11) +type LinuxRdma struct { + // Maximum number of HCA handles that can be opened. Default is "no limit". + HcaHandles *uint32 `json:"hca_handles,omitempty"` + // Maximum number of HCA objects that can be created. Default is "no limit". + HcaObjects *uint32 `json:"hca_objects,omitempty"` +} diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 89172316358..3831919d888 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -607,6 +607,15 @@ func CreateCgroupConfig(opts *CreateOpts, defaultDevs []*devices.Device) (*confi Limit: l.Limit, }) } + if len(r.Rdma) > 0 { + c.Resources.Rdma = make(map[string]configs.LinuxRdma, len(r.Rdma)) + for k, v := range r.Rdma { + c.Resources.Rdma[k] = configs.LinuxRdma{ + HcaHandles: v.HcaHandles, + HcaObjects: v.HcaObjects, + } + } + } if r.Network != nil { if r.Network.ClassID != nil { c.Resources.NetClsClassid = *r.Network.ClassID