Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

client: prevent start on cgroups init error #19915

Merged
merged 2 commits into from
Feb 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/19915.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
client: Prevent client from starting if cgroup initialization fails
```
5 changes: 4 additions & 1 deletion client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -477,10 +477,13 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulProxie
)

// Create the process wranglers
wranglers := proclib.New(&proclib.Configs{
wranglers, err := proclib.New(&proclib.Configs{
UsableCores: c.topology.UsableCores(),
Logger: c.logger.Named("proclib"),
})
if err != nil {
return nil, fmt.Errorf("failed to initialize process manager: %w", err)
}
c.wranglers = wranglers

// Build the allow/denylists of drivers.
Expand Down
62 changes: 23 additions & 39 deletions client/lib/cgroupslib/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ package cgroupslib

import (
"bytes"
"fmt"
"os"
"path/filepath"

Expand All @@ -23,7 +24,7 @@ const (

// Init will initialize the cgroup tree that the Nomad client will use for
// isolating resources of tasks. cores is the cpuset granted for use by Nomad.
// It returns a non-nil error if initialization fails.
func Init(log hclog.Logger, cores string) {
func Init(log hclog.Logger, cores string) error {
log.Info("initializing nomad cgroups", "cores", cores)

switch GetMode() {
Expand All @@ -41,8 +42,7 @@ func Init(log hclog.Logger, cores string) {
for _, ctrl := range controllers {
p := filepath.Join(root, ctrl, NomadCgroupParent)
if err := os.MkdirAll(p, 0755); err != nil {
log.Error("failed to create nomad cgroup", "controller", ctrl, "error", err)
return
return fmt.Errorf("failed to create nomad cgroup %s: %w", ctrl, err)
}
}

Expand All @@ -56,8 +56,7 @@ func Init(log hclog.Logger, cores string) {
// band from nomad itself
var memsSet string
if mems, err := detectMemsCG1(); err != nil {
log.Error("failed to detect memset", "error", err)
return
return fmt.Errorf("failed to detect memset: %w", err)
} else {
memsSet = mems
}
Expand All @@ -78,56 +77,47 @@ func Init(log hclog.Logger, cores string) {
// def456.task/{cgroup.procs, cpuset.cpus, cpuset.mems}

if err := writeCG(noClone, "cpuset", NomadCgroupParent, cloneFile); err != nil {
log.Error("failed to set clone_children on nomad cpuset cgroup", "error", err)
return
return fmt.Errorf("failed to set clone_children on nomad cpuset cgroup: %w", err)
}

if err := writeCG(memsSet, "cpuset", NomadCgroupParent, memsFile); err != nil {
log.Error("failed to set cpuset.mems on nomad cpuset cgroup", "error", err)
return
return fmt.Errorf("failed to set cpuset.mems on nomad cpuset cgroup: %w", err)
}

if err := writeCG(cores, "cpuset", NomadCgroupParent, cpusetFile); err != nil {
log.Error("failed to write cores to nomad cpuset cgroup", "error", err)
return
return fmt.Errorf("failed to write cores to nomad cpuset cgroup: %w", err)
}

//
// share partition
//

if err := mkCG("cpuset", NomadCgroupParent, SharePartition()); err != nil {
log.Error("failed to create share cpuset partition", "error", err)
return
return fmt.Errorf("failed to create share cpuset partition: %w", err)
}

if err := writeCG(noClone, "cpuset", NomadCgroupParent, SharePartition(), cloneFile); err != nil {
log.Error("failed to set clone_children on nomad cpuset cgroup", "error", err)
return
return fmt.Errorf("failed to set clone_children on nomad cpuset cgroup: %w", err)
}

if err := writeCG(memsSet, "cpuset", NomadCgroupParent, SharePartition(), memsFile); err != nil {
log.Error("failed to set cpuset.mems on share cpuset partition", "error", err)
return
return fmt.Errorf("failed to set cpuset.mems on share cpuset partition: %w", err)
}

//
// reserve partition
//

if err := mkCG("cpuset", NomadCgroupParent, ReservePartition()); err != nil {
log.Error("failed to create reserve cpuset partition", "error", err)
return
return fmt.Errorf("failed to create reserve cpuset partition: %w", err)
}

if err := writeCG(noClone, "cpuset", NomadCgroupParent, ReservePartition(), cloneFile); err != nil {
log.Error("failed to set clone_children on nomad cpuset cgroup", "error", err)
return
return fmt.Errorf("failed to set clone_children on nomad cpuset cgroup: %w", err)
}

if err := writeCG(memsSet, "cpuset", NomadCgroupParent, ReservePartition(), memsFile); err != nil {
log.Error("failed to set cpuset.mems on reserve cpuset partition", "error", err)
return
return fmt.Errorf("failed to set cpuset.mems on reserve cpuset partition: %w", err)
}

log.Debug("nomad cpuset partitions initialized", "cores", cores)
Expand All @@ -144,27 +134,23 @@ func Init(log hclog.Logger, cores string) {
//

if err := writeCG(activation, subtreeFile); err != nil {
log.Error("failed to create nomad cgroup", "error", err)
return
return fmt.Errorf("failed to create nomad cgroup: %w", err)
}

//
// configuring nomad.slice
//

if err := mkCG(NomadCgroupParent); err != nil {
log.Error("failed to create nomad cgroup", "error", err)
return
return fmt.Errorf("failed to create nomad cgroup: %w", err)
}

if err := writeCG(activation, NomadCgroupParent, subtreeFile); err != nil {
log.Error("failed to set subtree control on nomad cgroup", "error", err)
return
return fmt.Errorf("failed to set subtree control on nomad cgroup: %w", err)
}

if err := writeCG(cores, NomadCgroupParent, cpusetFile); err != nil {
log.Error("failed to write root partition cpuset", "error", err)
return
return fmt.Errorf("failed to write root partition cpuset: %w", err)
}

log.Debug("top level partition root nomad.slice cgroup initialized")
Expand All @@ -174,13 +160,11 @@ func Init(log hclog.Logger, cores string) {
//

if err := mkCG(NomadCgroupParent, SharePartition()); err != nil {
log.Error("failed to create share cgroup", "error", err)
return
return fmt.Errorf("failed to create share cgroup: %w", err)
}

if err := writeCG(activation, NomadCgroupParent, SharePartition(), subtreeFile); err != nil {
log.Error("failed to set subtree control on cpuset share partition", "error", err)
return
return fmt.Errorf("failed to set subtree control on cpuset share partition: %w", err)
}

log.Debug("partition member nomad.slice/share cgroup initialized")
Expand All @@ -190,17 +174,17 @@ func Init(log hclog.Logger, cores string) {
//

if err := mkCG(NomadCgroupParent, ReservePartition()); err != nil {
log.Error("failed to create share cgroup", "error", err)
return
return fmt.Errorf("failed to create share cgroup: %w", err)
}

if err := writeCG(activation, NomadCgroupParent, ReservePartition(), subtreeFile); err != nil {
log.Error("failed to set subtree control on cpuset reserve partition", "error", err)
return
return fmt.Errorf("failed to set subtree control on cpuset reserve partition: %w", err)
}

log.Debug("partition member nomad.slice/reserve cgroup initialized")
}

return nil
}

// detectMemsCG1 will determine the cpuset.mems value to use for
Expand Down
10 changes: 7 additions & 3 deletions client/lib/proclib/wrangler_cg1_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,20 @@ type LinuxWranglerCG1 struct {
cg cgroupslib.Lifecycle
}

// newCG1 initializes the Nomad cgroup tree via cgroupslib.Init and returns a
// create function that builds a LinuxWranglerCG1 for a given task. If
// cgroupslib.Init returns an error, that error is returned unchanged and no
// create function is produced.
//
// NOTE(review): this span is a rendered diff, so each changed statement
// appears twice (the pre-change line followed by its replacement).
func newCG1(c *Configs) create {
func newCG1(c *Configs) (create, error) {
// Logger scoped to "cg1"; used for Init and captured by every wrangler
// created by the returned function.
logger := c.Logger.Named("cg1")
cgroupslib.Init(logger, c.UsableCores.String())
err := cgroupslib.Init(logger, c.UsableCores.String())
if err != nil {
return nil, err
}

// The closure captures logger; the cgroup lifecycle handle is built per task
// from its alloc ID, task name, and cores.
return func(task Task) ProcessWrangler {
return &LinuxWranglerCG1{
task: task,
log: logger,
cg: cgroupslib.Factory(task.AllocID, task.Task, task.Cores),
}
}
}, nil
}

func (w *LinuxWranglerCG1) Initialize() error {
Expand Down
10 changes: 7 additions & 3 deletions client/lib/proclib/wrangler_cg2_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,20 @@ type LinuxWranglerCG2 struct {
cg cgroupslib.Lifecycle
}

// newCG2 initializes the Nomad cgroup tree via cgroupslib.Init and returns a
// create function that builds a LinuxWranglerCG2 for a given task. If
// cgroupslib.Init returns an error, that error is returned unchanged and no
// create function is produced.
//
// NOTE(review): this span is a rendered diff, so each changed statement
// appears twice (the pre-change line followed by its replacement).
func newCG2(c *Configs) create {
func newCG2(c *Configs) (create, error) {
// Logger scoped to "cg2"; passed to Init below.
logger := c.Logger.Named("cg2")
cgroupslib.Init(logger, c.UsableCores.String())
err := cgroupslib.Init(logger, c.UsableCores.String())
if err != nil {
return nil, err
}

return func(task Task) ProcessWrangler {
return &LinuxWranglerCG2{
task: task,
// NOTE(review): the wrangler receives c.Logger rather than the named "cg2"
// logger created above, whereas newCG1 passes its named logger. Confirm
// whether this difference is intended.
log: c.Logger,
cg: cgroupslib.Factory(task.AllocID, task.Task, task.Cores),
}
}
}, nil
}

func (w LinuxWranglerCG2) Initialize() error {
Expand Down
4 changes: 2 additions & 2 deletions client/lib/proclib/wrangler_default.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@ package proclib

// New creates a Wranglers backed by the DefaultWrangler implementation, which
// does not do anything.
func New(configs *Configs) *Wranglers {
func New(configs *Configs) (*Wranglers, error) {
// NOTE(review): this span is a rendered diff, so the signature and the final
// return each appear twice (pre-change line, then its replacement).
w := &Wranglers{
configs: configs,
m: make(map[Task]ProcessWrangler),
// doNothing supplies the create function for this implementation.
create: doNothing(configs),
}

// Nothing in this implementation can fail, so the error is always nil.
return w
return w, nil
}

func doNothing(*Configs) create {
Expand Down
9 changes: 5 additions & 4 deletions client/lib/proclib/wrangler_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,19 @@ import (

// New creates a Wranglers factory for creating ProcessWranglers appropriate
// for the given system (i.e. cgroups v1 or cgroups v2). It returns an error
// if setting up the selected cgroups implementation fails.
func New(configs *Configs) *Wranglers {
func New(configs *Configs) (*Wranglers, error) {
// NOTE(review): this span is a rendered diff, so each changed statement
// appears twice (the pre-change line followed by its replacement).
w := &Wranglers{
configs: configs,
m: make(map[Task]ProcessWrangler),
}

// err is set by whichever constructor the switch below selects.
var err error
switch cgroupslib.GetMode() {
case cgroupslib.CG1:
w.create = newCG1(configs)
w.create, err = newCG1(configs)
default:
// Every mode other than CG1 uses the cgroups v2 wrangler.
w.create = newCG2(configs)
w.create, err = newCG2(configs)
}

// NOTE(review): w is returned non-nil even when err is non-nil (its create
// field is then whatever the failed constructor returned); callers must check
// err before using w.
return w
return w, err
}
Loading