Skip to content

Commit

Permalink
Make failed job acquisitions return a specific exit code (27)
Browse files Browse the repository at this point in the history
  • Loading branch information
moskyb committed May 2, 2024
1 parent cd0a068 commit de3e8d1
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 5 deletions.
12 changes: 8 additions & 4 deletions agent/agent_worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,8 @@ func (a *AgentWorker) Ping(ctx context.Context) (*api.Job, error) {
return ping.Job, nil
}

var ErrJobAcquisitionFailure = errors.New("failed to acquire job")

// AcquireAndRunJob attempts to acquire a job an run it. It will retry at after the
// server determined interval (from the Retry-After response header) if the job is in the waiting
// state. If the job is in an unassignable state, it will return an error immediately.
Expand Down Expand Up @@ -594,11 +596,13 @@ func (a *AgentWorker) AcquireAndRunJob(ctx context.Context, jobId string) error
if resp != nil {
switch resp.StatusCode {
case http.StatusUnprocessableEntity:
// If the API returns with a 422, that means that we
// successfully *tried* to acquire the job, but
// Buildkite rejected the finish for some reason.
// If the API returns with a 422, it usually means that the job is in a state where it can't be acquired -
// e.g. it's already running on another agent, or has been cancelled, or has already run
a.logger.Warn("Buildkite rejected the call to acquire the job (%s)", err)
r.Break()

return nil, fmt.Errorf("%w: %w", ErrJobAcquisitionFailure, err)

case http.StatusLocked:
// If the API returns with a 423, the job is in the waiting state
a.logger.Warn("The job is waiting for a dependency (%s)", err)
Expand All @@ -619,7 +623,7 @@ func (a *AgentWorker) AcquireAndRunJob(ctx context.Context, jobId string) error

// If `acquiredJob` is nil, then the job was never acquired
if acquiredJob == nil {
return fmt.Errorf("Failed to acquire job: %w", err)
return fmt.Errorf("failed to acquire job: %w", err)
}

// Now that we've acquired the job, let's run it
Expand Down
41 changes: 41 additions & 0 deletions agent/agent_worker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package agent

import (
"context"
"errors"
"fmt"
"math"
"net/http"
Expand Down Expand Up @@ -109,6 +110,42 @@ func TestDisconnectRetry(t *testing.T) {
assert.Equal(t, "[info] Disconnected", l.Messages[3])
}

func TestAcquireJobReturnsWrappedError_WhenServerResponds422(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

jobID := "some-uuid"

server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
switch req.URL.Path {
case fmt.Sprintf("/jobs/%s/acquire", jobID):
rw.WriteHeader(http.StatusUnprocessableEntity)
return

default:
http.Error(rw, "Not found", http.StatusNotFound)
}
}))
defer server.Close()

client := api.NewClient(logger.Discard, api.Config{
Endpoint: server.URL,
Token: "llamas",
})

worker := &AgentWorker{
logger: logger.Discard,
agent: nil,
apiClient: client,
agentConfiguration: AgentConfiguration{},
}

err := worker.AcquireAndRunJob(ctx, jobID)
if !errors.Is(err, ErrJobAcquisitionFailure) {
t.Fatalf("expected worker.AcquireAndRunJob(%q) = ErrJobAcquisitionFailure, got %v", jobID, err)
}
}

func TestAcquireAndRunJobWaiting(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
Expand Down Expand Up @@ -157,6 +194,10 @@ func TestAcquireAndRunJobWaiting(t *testing.T) {
err := worker.AcquireAndRunJob(ctx, "waitinguuid")
assert.ErrorContains(t, err, "423")

if errors.Is(err, ErrJobAcquisitionFailure) {
t.Fatalf("expected worker.AcquireAndRunJob(%q) not to be a ErrJobAcquisitionFailure, but it was: %v", "waitinguuid", err)
}

// the last Retry-After is not recorded as the retries loop exits before using it
exptectedSleeps := make([]time.Duration, 0, 9)
for d := 1; d <= 1<<8; d *= 2 {
Expand Down
11 changes: 10 additions & 1 deletion clicommand/agent_start.go
Original file line number Diff line number Diff line change
Expand Up @@ -1182,7 +1182,16 @@ var AgentStartCommand = cli.Command{
pool.StartStatusServer(ctx, l, cfg.HealthCheckAddr)
}

return pool.Start(ctx)
err = pool.Start(ctx)
if errors.Is(err, agent.ErrJobAcquisitionFailure) {
// If the agent tried to acquire a job, but it couldn't because the job was already taken, we should exit with a
// specific exit code so that the caller can know that this job can't be acquired.

const acquisitionFailedExitCode = 27 // chosen by fair dice roll
return cli.NewExitError(err, acquisitionFailedExitCode)
}

return err
},
}

Expand Down

0 comments on commit de3e8d1

Please sign in to comment.