Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

template: fix panic in change_mode=script on client restart #24057

Merged
merged 1 commit into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/24057.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
template: Fixed a panic on client restart when using change_mode=script
```
1 change: 1 addition & 0 deletions client/allocrunner/interfaces/task_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ type TaskPoststartRequest struct {
// Stats collector
DriverStats DriverStats
}

type TaskPoststartResponse struct{}

type TaskPoststartHook interface {
Expand Down
6 changes: 6 additions & 0 deletions client/allocrunner/taskrunner/driver_handle.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ func (h *DriverHandle) Signal(s string) error {

// Exec is the handled used by client endpoint handler to invoke the appropriate task driver exec.
func (h *DriverHandle) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
if h == nil {
return nil, 0, ErrTaskNotRunning
}
command := append([]string{cmd}, args...)
res, err := h.driver.ExecTask(h.taskID, command, timeout)
if err != nil {
Expand All @@ -80,6 +83,9 @@ func (h *DriverHandle) ExecStreaming(ctx context.Context,
command []string,
tty bool,
stream drivers.ExecTaskStream) error {
if h == nil {
return ErrTaskNotRunning
}
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These changes aren't really necessary now, but they'll provide belt-and-suspenders against future bugs.


if impl, ok := h.driver.(drivers.ExecTaskStreamingRawDriver); ok {
return impl.ExecTaskStreamingRaw(ctx, h.taskID, command, tty, stream)
Expand Down
4 changes: 4 additions & 0 deletions client/allocrunner/taskrunner/interfaces/lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package interfaces

import (
"context"
"time"

"github.com/hashicorp/nomad/nomad/structs"
)
Expand All @@ -20,6 +21,9 @@ type TaskLifecycle interface {
// Kill a task permanently.
Kill(ctx context.Context, event *structs.TaskEvent) error

// Exec into a running task.
Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error)

// IsRunning returns true if the task runner has a handle to the task
// driver, which is useful for distinguishing restored tasks during
// prestart hooks. But note that the driver handle could go away after you
Expand Down
13 changes: 13 additions & 0 deletions client/allocrunner/taskrunner/lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package taskrunner

import (
"context"
"time"

"github.com/hashicorp/nomad/nomad/structs"
)
Expand Down Expand Up @@ -126,6 +127,18 @@ func (tr *TaskRunner) restartImpl(ctx context.Context, event *structs.TaskEvent,
return nil
}

func (tr *TaskRunner) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
tr.logger.Trace("Exec requested")

handle := tr.getDriverHandle()
if handle == nil {
return nil, 0, ErrTaskNotRunning
}

out, code, err := handle.Exec(timeout, cmd, args)
return out, code, err
}

func (tr *TaskRunner) Signal(event *structs.TaskEvent, s string) error {
tr.logger.Trace("Signal requested", "signal", s)

Expand Down
1 change: 0 additions & 1 deletion client/allocrunner/taskrunner/task_runner_hooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,6 @@ func (tr *TaskRunner) initHooks() {
consulNamespace: consulNamespace,
nomadNamespace: tr.alloc.Job.Namespace,
renderOnTaskRestart: task.RestartPolicy.RenderTemplates,
driverHandle: tr.handle,
}))
}

Expand Down
32 changes: 4 additions & 28 deletions client/allocrunner/taskrunner/template/template.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,6 @@ type TaskTemplateManager struct {
// runner is the consul-template runner
runner *manager.Runner

// handle is used to execute scripts
handle interfaces.ScriptExecutor
handleLock sync.Mutex

// signals is a lookup map from the string representation of a signal to its
// actual signal
signals map[string]os.Signal
Expand Down Expand Up @@ -220,14 +216,6 @@ func (tm *TaskTemplateManager) Stop() {
}
}

// SetDriverHandle sets the executor
func (tm *TaskTemplateManager) SetDriverHandle(executor interfaces.ScriptExecutor) {
tm.handleLock.Lock()
defer tm.handleLock.Unlock()
tm.handle = executor

}

// run is the long lived loop that handles errors and templates being rendered
func (tm *TaskTemplateManager) run() {
// Runner is nil if there are no templates
Expand Down Expand Up @@ -583,21 +571,10 @@ func (tm *TaskTemplateManager) handleScriptError(script *structs.ChangeScript, m
func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *sync.WaitGroup) {
defer wg.Done()

tm.handleLock.Lock()
defer tm.handleLock.Unlock()
if tm.handle == nil {
failureMsg := fmt.Sprintf(
"Template failed to run script %v with arguments %v because task driver handle is not available",
script.Command,
script.Args,
)
tm.handleScriptError(script, failureMsg)
return
}
_, exitCode, err := tm.handle.Exec(script.Timeout, script.Command, script.Args)
_, exitCode, err := tm.config.Lifecycle.Exec(script.Timeout, script.Command, script.Args)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this feels so much nicer 👏

if err != nil {
failureMsg := fmt.Sprintf(
"Template failed to run script %v with arguments %v on change: %v Exit code: %v",
"Template failed to run script %v with arguments %v on change: %v. Exit code: %v",
script.Command,
script.Args,
err,
Expand All @@ -608,7 +585,7 @@ func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *s
}
if exitCode != 0 {
failureMsg := fmt.Sprintf(
"Template ran script %v with arguments %v on change but it exited with code code: %v",
"Template ran script %v with arguments %v on change but it exited with code: %v",
script.Command,
script.Args,
exitCode,
Expand All @@ -619,10 +596,9 @@ func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *s
tm.config.Events.EmitEvent(structs.NewTaskEvent(structs.TaskHookMessage).
SetDisplayMessage(
fmt.Sprintf(
"Template successfully ran script %v with arguments: %v. Exit code: %v",
"Template successfully ran script %v with arguments: %v. Exit code: 0",
script.Command,
script.Args,
exitCode,
)))
}

Expand Down
44 changes: 4 additions & 40 deletions client/allocrunner/taskrunner/template/template_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,6 @@ const (
TestTaskName = "test-task"
)

// mockExecutor implements script executor interface
type mockExecutor struct {
DesiredExit int
DesiredErr error
}

func (m *mockExecutor) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
return []byte{}, m.DesiredExit, m.DesiredErr
}

// testHarness is used to test the TaskTemplateManager by spinning up
// Consul/Vault as needed
type testHarness struct {
Expand Down Expand Up @@ -1146,7 +1136,7 @@ func TestTaskTemplateManager_ScriptExecution(t *testing.T) {
key2 := "bar"
content1_1 := "cat"
content1_2 := "dog"
content1_3 := "goldfish"

t1 := &structs.Template{
EmbeddedTmpl: `
FOO={{key "bam"}}
Expand Down Expand Up @@ -1176,10 +1166,9 @@ BAR={{key "bar"}}
Envvars: true,
}

me := mockExecutor{DesiredExit: 0, DesiredErr: nil}
harness := newTestHarness(t, []*structs.Template{t1, t2}, true, false)
harness.mockHooks.SetupExecTest(0, nil)
harness.start(t)
harness.manager.SetDriverHandle(&me)
defer harness.stop()

// Ensure no unblock
Expand Down Expand Up @@ -1220,29 +1209,6 @@ OUTER:
t.Fatal(t, "should have received an event")
}
}

// remove the reference to the task handle and update the template contents
Copy link
Member Author

@tgross tgross Sep 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note to reviewers: this is the test we added in #23917 that we knew even at the time didn't really test anything useful. Now that we've fixed the problem by fixing the architecture, this doesn't even have anything left to test so I'm removing it. There's no way to "remove the task handle" anymore, as the template manager doesn't have a reference to it at all. The code for this PR is covered in task_runner_hook_test.go

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense to remove this, but I don't see a test file called task_runner_hook_test.go?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, that's a typo, it's this test: https://github.com/hashicorp/nomad/blob/main/client/allocrunner/taskrunner/template_hook_test.go#L277 Note that only an integration test that spins up a real task and can restart a client can actually exercise this bug. So that falls under the general pile of "things we'd really like to do when we have solid E2E upgrade/restart testing".

// again
harness.manager.SetDriverHandle(nil)
harness.consul.SetKV(t, key1, []byte(content1_3))
timeout = time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second)

OUTER2:
for {
select {
case <-harness.mockHooks.RestartCh:
t.Fatal(t, "restart not expected")
case ev := <-harness.mockHooks.EmitEventCh:
if strings.Contains(
ev.DisplayMessage, "task driver handle is not available") {
break OUTER2
}
case <-harness.mockHooks.SignalCh:
t.Fatal(t, "signal not expected")
case <-timeout:
t.Fatal(t, "should have received an event that task driver handle is unavailable")
}
}
}

// TestTaskTemplateManager_ScriptExecutionFailTask tests whether we fail the
Expand Down Expand Up @@ -1285,10 +1251,9 @@ BAR={{key "bar"}}
Envvars: true,
}

me := mockExecutor{DesiredExit: 1, DesiredErr: fmt.Errorf("Script failed")}
harness := newTestHarness(t, []*structs.Template{t1, t2}, true, false)
harness.mockHooks.SetupExecTest(1, fmt.Errorf("Script failed"))
harness.start(t)
harness.manager.SetDriverHandle(&me)
defer harness.stop()

// Ensure no unblock
Expand Down Expand Up @@ -1365,10 +1330,9 @@ COMMON={{key "common"}}
templateScript,
}

me := mockExecutor{DesiredExit: 0, DesiredErr: nil}
harness := newTestHarness(t, templates, true, false)
harness.mockHooks.SetupExecTest(0, nil)
harness.start(t)
harness.manager.SetDriverHandle(&me)
defer harness.stop()

// Ensure no unblock
Expand Down
39 changes: 0 additions & 39 deletions client/allocrunner/taskrunner/template_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,6 @@ type templateHookConfig struct {

// hookResources are used to fetch Consul tokens
hookResources *cstructs.AllocHookResources

// driverHandle is the task driver executor used to run scripts when the
// template change mode is set to script. Typically this will be nil in this
// config struct, unless we're restoring a task after a client restart.
driverHandle ti.ScriptExecutor
}

type templateHook struct {
Expand All @@ -72,15 +67,6 @@ type templateHook struct {
templateManager *template.TaskTemplateManager
managerLock sync.Mutex

// driverHandle is the task driver executor used by the template manager to
// run scripts when the template change mode is set to script. This value is
// set in the Poststart hook after the task has run, or passed in as
// configuration if this is a task that's being restored after a client
// restart.
//
// Must obtain a managerLock before changing. It may be nil.
driverHandle ti.ScriptExecutor

// consulNamespace is the current Consul namespace
consulNamespace string

Expand Down Expand Up @@ -113,7 +99,6 @@ func newTemplateHook(config *templateHookConfig) *templateHook {
config: config,
consulNamespace: config.consulNamespace,
logger: config.logger.Named(templateHookName),
driverHandle: config.driverHandle,
}
}

Expand Down Expand Up @@ -201,27 +186,6 @@ func (h *templateHook) Prestart(ctx context.Context, req *interfaces.TaskPrestar
return nil
}

func (h *templateHook) Poststart(_ context.Context, req *interfaces.TaskPoststartRequest, resp *interfaces.TaskPoststartResponse) error {
h.managerLock.Lock()
defer h.managerLock.Unlock()

if h.templateManager == nil {
return nil
}

if req.DriverExec != nil {
h.driverHandle = req.DriverExec
h.templateManager.SetDriverHandle(h.driverHandle)
} else {
for _, tmpl := range h.config.templates {
if tmpl.ChangeMode == structs.TemplateChangeModeScript {
return fmt.Errorf("template has change mode set to 'script' but task driver handle is not available")
}
}
}
return nil
}

func (h *templateHook) newManager() (unblock chan struct{}, err error) {
unblock = make(chan struct{})

Expand Down Expand Up @@ -263,9 +227,6 @@ func (h *templateHook) newManager() (unblock chan struct{}, err error) {
}

h.templateManager = m
if h.driverHandle != nil {
h.templateManager.SetDriverHandle(h.driverHandle)
}
return unblock, nil
}

Expand Down
16 changes: 5 additions & 11 deletions client/allocrunner/taskrunner/template_hook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,9 @@ func Test_templateHook_Prestart_ConsulWI(t *testing.T) {
hookResources: tt.hr,
}
h := &templateHook{
config: conf,
logger: logger,
managerLock: sync.Mutex{},
driverHandle: nil,
config: conf,
logger: logger,
managerLock: sync.Mutex{},
}
req := &interfaces.TaskPrestartRequest{
Alloc: a,
Expand Down Expand Up @@ -289,15 +288,11 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) {
envBuilder := taskenv.NewBuilder(mock.Node(), alloc, task, clientConfig.Region)

lifecycle := trtesting.NewMockTaskHooks()
lifecycle.SetupExecTest(117, fmt.Errorf("oh no"))
lifecycle.HasHandle = true

events := &trtesting.MockEmitter{}

executor := &simpleExec{
code: 117,
err: fmt.Errorf("oh no"),
}

hook := newTemplateHook(&templateHookConfig{
alloc: alloc,
logger: logger,
Expand All @@ -315,7 +310,6 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) {
clientConfig: clientConfig,
envBuilder: envBuilder,
hookResources: &cstructs.AllocHookResources{},
driverHandle: executor,
})
req := &interfaces.TaskPrestartRequest{
Alloc: alloc,
Expand All @@ -334,7 +328,7 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) {
gotEvents := events.Events()
must.Len(t, 1, gotEvents)
must.Eq(t, structs.TaskHookFailed, gotEvents[0].Type)
must.Eq(t, "Template failed to run script echo with arguments [foo] on change: oh no Exit code: 117",
must.Eq(t, "Template failed to run script echo with arguments [foo] on change: oh no. Exit code: 117",
gotEvents[0].DisplayMessage)

}
Loading