Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Let artifact phase and post-command run in grace period #2899

Merged
merged 6 commits into from
Jul 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 3 additions & 20 deletions clicommand/agent_start.go
Original file line number Diff line number Diff line change
Expand Up @@ -843,28 +843,11 @@ var AgentStartCommand = cli.Command{
}
}

// Treat a negative signal grace period as relative to the cancel grace period
if cfg.SignalGracePeriodSeconds < 0 {
if cfg.CancelGracePeriod < -cfg.SignalGracePeriodSeconds {
return fmt.Errorf(
"cancel-grace-period (%d) must be at least as big as signal-grace-period-seconds (%d)",
cfg.CancelGracePeriod,
cfg.SignalGracePeriodSeconds,
)
}
cfg.SignalGracePeriodSeconds = cfg.CancelGracePeriod + cfg.SignalGracePeriodSeconds
}

if cfg.CancelGracePeriod <= cfg.SignalGracePeriodSeconds {
return fmt.Errorf(
"cancel-grace-period (%d) must be greater than signal-grace-period-seconds (%d)",
cfg.CancelGracePeriod,
cfg.SignalGracePeriodSeconds,
)
signalGracePeriod, err := signalGracePeriod(cfg.CancelGracePeriod, cfg.SignalGracePeriodSeconds)
if err != nil {
return err
}

signalGracePeriod := time.Duration(cfg.SignalGracePeriodSeconds) * time.Second

mc := metrics.NewCollector(l, metrics.CollectorConfig{
Datadog: cfg.MetricsDatadog,
DatadogHost: cfg.MetricsDatadogHost,
Expand Down
8 changes: 6 additions & 2 deletions clicommand/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (
"runtime"
"sync"
"syscall"
"time"

"github.com/buildkite/agent/v3/internal/job"
"github.com/buildkite/agent/v3/process"
Expand Down Expand Up @@ -91,6 +90,7 @@ type BootstrapConfig struct {
Phases []string `cli:"phases" normalize:"list"`
Profile string `cli:"profile"`
CancelSignal string `cli:"cancel-signal"`
CancelGracePeriod int `cli:"cancel-grace-period"`
SignalGracePeriodSeconds int `cli:"signal-grace-period-seconds"`
RedactedVars []string `cli:"redacted-vars" normalize:"list"`
TracingBackend string `cli:"tracing-backend"`
Expand Down Expand Up @@ -371,6 +371,7 @@ var BootstrapCommand = cli.Command{
EnvVar: "BUILDKITE_CONTAINER_ID",
},
cancelSignalFlag,
cancelGracePeriodFlag,
signalGracePeriodSecondsFlag,

// Global flags
Expand Down Expand Up @@ -408,7 +409,10 @@ var BootstrapCommand = cli.Command{
return fmt.Errorf("failed to parse cancel-signal: %w", err)
}

signalGracePeriod := time.Duration(cfg.SignalGracePeriodSeconds) * time.Second
signalGracePeriod, err := signalGracePeriod(cfg.CancelGracePeriod, cfg.SignalGracePeriodSeconds)
if err != nil {
return err
}

// Configure the bootstraper
bootstrap := job.New(job.ExecutorConfig{
Expand Down
37 changes: 36 additions & 1 deletion clicommand/cancel_signal.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
package clicommand

import "github.com/urfave/cli"
import (
"fmt"
"time"

"github.com/urfave/cli"
)

const (
defaultCancelGracePeriod = 10
Expand Down Expand Up @@ -30,3 +35,33 @@ var (
Value: -1,
}
)

// signalGracePeriod computes the signal grace period based on the various
// possible flag configurations:
// - If signalGracePeriodSecs is negative, it is relative to
// cancelGracePeriodSecs.
// - If cancelGracePeriodSecs is less than signalGracePeriodSecs that is an
// error.
func signalGracePeriod(cancelGracePeriodSecs, signalGracePeriodSecs int) (time.Duration, error) {
// Treat a negative signal grace period as relative to the cancel grace period
if signalGracePeriodSecs < 0 {
if cancelGracePeriodSecs < -signalGracePeriodSecs {
return 0, fmt.Errorf(
"cancel-grace-period (%d) must be at least as big as signal-grace-period-seconds (%d)",
cancelGracePeriodSecs,
signalGracePeriodSecs,
)
}
signalGracePeriodSecs = cancelGracePeriodSecs + signalGracePeriodSecs
}

if cancelGracePeriodSecs <= signalGracePeriodSecs {
return 0, fmt.Errorf(
"cancel-grace-period (%d) must be greater than signal-grace-period-seconds (%d)",
cancelGracePeriodSecs,
signalGracePeriodSecs,
)
}

return time.Duration(signalGracePeriodSecs) * time.Second, nil
}
61 changes: 28 additions & 33 deletions internal/job/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,10 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) {
defer stopper()
defer func() { span.FinishWithError(err) }()

// kind of a weird series of events - wrap a context in a cancellation (at the top of the func), then pull the cancellation
// out again, but some of the stuff we need to do in the executor (namely the teardown) needs to be able to "survive"
// a cancellation so we need to be able to pass a cancellable context to the non-teardown stuff, and an uncancellable
// context to the teardown stuff
nonCancelCtx := context.WithoutCancel(ctx)

// Listen for cancellation
// Listen for cancellation. Once ctx is cancelled, some tasks can run
// afterwards during the signal grace period. These use graceCtx.
graceCtx, graceCancel := withGracePeriod(ctx, e.SignalGracePeriod)
defer graceCancel()
go func() {
<-e.cancelCh
e.shell.Commentf("Received cancellation signal, interrupting")
Expand All @@ -151,7 +148,9 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) {

// Tear down the environment (and fire pre-exit hook) before we exit
defer func() {
if err = e.tearDown(nonCancelCtx); err != nil {
// We strive to let the executor tear-down happen whether or not the job
// (and thus ctx) is cancelled, so it can run during the grace period.
if err := e.tearDown(graceCtx); err != nil {
e.shell.Errorf("Error tearing down job executor: %v", err)

// this gets passed back via the named return
Expand Down Expand Up @@ -237,8 +236,10 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) {
span.RecordError(commandErr)
}

// Only upload artifacts as part of the command phase
if err = e.artifactPhase(ctx); err != nil {
// Only upload artifacts as part of the command phase.
// The artifacts might be relevant for debugging job timeouts, so it can
// run during the grace period.
if err := e.artifactPhase(graceCtx); err != nil {
e.shell.Errorf("%v", err)

if commandErr != nil {
Expand Down Expand Up @@ -831,58 +832,48 @@ func (e *Executor) tearDown(ctx context.Context) error {
}

// runPreCommandHooks runs the pre-command hooks and adds tracing spans.
func (e *Executor) runPreCommandHooks(ctx context.Context) error {
func (e *Executor) runPreCommandHooks(ctx context.Context) (err error) {
spanName := e.implementationSpecificSpanName("pre-command", "pre-command hooks")
span, ctx := tracetools.StartSpanFromContext(ctx, spanName, e.ExecutorConfig.TracingBackend)
var err error
defer func() { span.FinishWithError(err) }()

if err = e.executeGlobalHook(ctx, "pre-command"); err != nil {
return err
}
if err = e.executeLocalHook(ctx, "pre-command"); err != nil {
if err := e.executeGlobalHook(ctx, "pre-command"); err != nil {
return err
}
if err = e.executePluginHook(ctx, "pre-command", e.pluginCheckouts); err != nil {
if err := e.executeLocalHook(ctx, "pre-command"); err != nil {
return err
}
return nil
return e.executePluginHook(ctx, "pre-command", e.pluginCheckouts)
}

// runCommand runs the command and adds tracing spans.
func (e *Executor) runCommand(ctx context.Context) error {
var err error
// There can only be one command hook, so we check them in order of plugin, local
switch {
case e.hasPluginHook("command"):
err = e.executePluginHook(ctx, "command", e.pluginCheckouts)
return e.executePluginHook(ctx, "command", e.pluginCheckouts)
case e.hasLocalHook("command"):
err = e.executeLocalHook(ctx, "command")
return e.executeLocalHook(ctx, "command")
case e.hasGlobalHook("command"):
err = e.executeGlobalHook(ctx, "command")
return e.executeGlobalHook(ctx, "command")
default:
err = e.defaultCommandPhase(ctx)
return e.defaultCommandPhase(ctx)
}
return err
}

// runPostCommandHooks runs the post-command hooks and adds tracing spans.
func (e *Executor) runPostCommandHooks(ctx context.Context) error {
func (e *Executor) runPostCommandHooks(ctx context.Context) (err error) {
spanName := e.implementationSpecificSpanName("post-command", "post-command hooks")
span, ctx := tracetools.StartSpanFromContext(ctx, spanName, e.ExecutorConfig.TracingBackend)
var err error
defer func() { span.FinishWithError(err) }()

if err = e.executeGlobalHook(ctx, "post-command"); err != nil {
if err := e.executeGlobalHook(ctx, "post-command"); err != nil {
return err
}
if err = e.executeLocalHook(ctx, "post-command"); err != nil {
if err := e.executeLocalHook(ctx, "post-command"); err != nil {
return err
}
if err = e.executePluginHook(ctx, "post-command", e.pluginCheckouts); err != nil {
return err
}
return nil
return e.executePluginHook(ctx, "post-command", e.pluginCheckouts)
}

// CommandPhase determines how to run the build, and then runs it
Expand All @@ -900,7 +891,11 @@ func (e *Executor) CommandPhase(ctx context.Context) (hookErr error, commandErr
if preCommandErr != nil {
return
}
hookErr = e.runPostCommandHooks(ctx)
// Because post-command hooks are often used for post-job cleanup, they
// can run during the grace period.
graceCtx, cancel := withGracePeriod(ctx, e.SignalGracePeriod)
defer cancel()
hookErr = e.runPostCommandHooks(graceCtx)
}()

// Run pre-command hooks
Expand Down
27 changes: 27 additions & 0 deletions internal/job/grace.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package job

import (
"context"
"time"
)

// withGracePeriod returns a context that is cancelled some time *after* the
// parent context is cancelled. In general this is not a good pattern, since it
// breaks the usual connection between context cancellations and requires an
// extra goroutine. However, we need to enforce the signal grace period from
// within the job executor for use-cases where the executor is _not_ forked from
// something else that will enforce the grace period (with SIGKILL).
func withGracePeriod(ctx context.Context, graceTimeout time.Duration) (context.Context, context.CancelFunc) {
gctx, cancel := context.WithCancelCause(context.WithoutCancel(ctx))
go func() {
<-ctx.Done()
select {
case <-time.After(graceTimeout):
cancel(context.DeadlineExceeded)

case <-gctx.Done():
// This can happen if the caller called the cancel func.
}
}()
return gctx, func() { cancel(context.Canceled) }
}