Skip to content

Commit

Permalink
Terminate TaskRun when Pod fails due to ImagePullBackOff.
Browse files Browse the repository at this point in the history
Prior to this, if the Pod was in ImagePullBackOff state,
the TaskRun would remain `Running` with the message `Pending` until it eventually timed out.
This led to lots of delays. The expected behavior should have been to
terminate the TaskRun and set it to `fail`. This PR addresses issue
tektoncd#4895.
  • Loading branch information
chitrangpatel authored and tekton-robot committed Jun 9, 2022
1 parent a4b3468 commit 4621a66
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/taskruns.md
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,7 @@ False|\[Error message\]|No|The TaskRun encountered a non-permanent error, and it
False|\[Error message\]|Yes|The TaskRun failed with a permanent error (usually validation).
False|TaskRunCancelled|Yes|The TaskRun was cancelled successfully.
False|TaskRunTimeout|Yes|The TaskRun timed out.
False|TaskRunImagePullFailed|Yes|The TaskRun failed due to one of its steps not being able to pull the image.

When a `TaskRun` changes status, [events](events.md#taskruns) are triggered accordingly.

Expand Down
2 changes: 2 additions & 0 deletions pkg/apis/pipeline/v1beta1/taskrun_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ const (
// TaskRunReasonResolvingTaskRef indicates that the TaskRun is waiting for
// its taskRef to be asynchronously resolved.
TaskRunReasonResolvingTaskRef = "ResolvingTaskRef"
// TaskRunReasonImagePullFailed is the reason set when the step of a task fails due to image not being pulled
TaskRunReasonImagePullFailed TaskRunReason = "TaskRunImagePullFailed"
)

func (t TaskRunReason) String() string {
Expand Down
16 changes: 16 additions & 0 deletions pkg/reconciler/taskrun/taskrun.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,12 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1beta1.TaskRun) pkg
return c.finishReconcileUpdateEmitEvents(ctx, tr, before, err)
}

// Check for Pod Failures
if failed, message := c.checkPodFailures(ctx, tr); failed {
err := c.failTaskRun(ctx, tr, v1beta1.TaskRunReasonImagePullFailed, message)
return c.finishReconcileUpdateEmitEvents(ctx, tr, before, err)
}

// prepare fetches all required resources, validates them together with the
// taskrun, runs API conversions. In case of error we update, emit events and return.
_, rtr, err := c.prepare(ctx, tr)
Expand Down Expand Up @@ -188,6 +194,16 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1beta1.TaskRun) pkg
return nil
}

func (c *Reconciler) checkPodFailures(ctx context.Context, tr *v1beta1.TaskRun) (bool, string) {
for _, step := range tr.Status.Steps {
if step.Waiting != nil && step.Waiting.Reason == "ImagePullBackOff" {
message := fmt.Sprintf(`A step in TaskRun %q failed to pull the image. The pod errored with the message: "%s."`, tr.Name, step.Waiting.Message)
return true, message
}
}
return false, ""
}

func (c *Reconciler) durationAndCountMetrics(ctx context.Context, tr *v1beta1.TaskRun) {
logger := logging.FromContext(ctx)
if tr.IsDone() {
Expand Down
56 changes: 56 additions & 0 deletions pkg/reconciler/taskrun/taskrun_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1884,6 +1884,62 @@ status:
}
}

func TestReconcilePodFailures(t *testing.T) {
taskRun := parse.MustParseTaskRun(t, `
metadata:
name: test-imagepull-fail
namespace: foo
spec:
taskSpec:
steps:
- image: whatever
status:
steps:
- container: step-unnamed-0
name: unnamed-0
waiting:
message: Back-off pulling image "whatever"
reason: ImagePullBackOff
taskSpec:
steps:
- image: whatever
`)
expectedStatus := &apis.Condition{
Type: apis.ConditionSucceeded,
Status: corev1.ConditionFalse,
Reason: "TaskRunImagePullFailed",
Message: `A step in TaskRun "test-imagepull-fail" failed to pull the image. The pod errored with the message: "Back-off pulling image "whatever"."`,
}

wantEvents := []string{
"Normal Started ",
`Warning Failed A step in TaskRun "test-imagepull-fail" failed to pull the image. The pod errored with the message: "Back-off pulling image "whatever".`,
}
d := test.Data{
TaskRuns: []*v1beta1.TaskRun{taskRun},
}
testAssets, cancel := getTaskRunController(t, d)
defer cancel()
c := testAssets.Controller
clients := testAssets.Clients

if err := c.Reconciler.Reconcile(testAssets.Ctx, getRunName(taskRun)); err != nil {
t.Fatalf("Unexpected error when reconciling completed TaskRun : %v", err)
}
newTr, err := clients.Pipeline.TektonV1beta1().TaskRuns(taskRun.Namespace).Get(testAssets.Ctx, taskRun.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("Expected completed TaskRun %s to exist but instead got error when getting it: %v", taskRun.Name, err)
}
condition := newTr.Status.GetCondition(apis.ConditionSucceeded)
if d := cmp.Diff(expectedStatus, condition, ignoreLastTransitionTime); d != "" {
t.Fatalf("Did not get expected condition %s", diff.PrintWantGot(d))
}
err = eventstest.CheckEventsOrdered(t, testAssets.Recorder.Events, taskRun.Name, wantEvents)
if err != nil {
t.Errorf(err.Error())
}
}

func TestReconcileTimeouts(t *testing.T) {
type testCase struct {
name string
Expand Down

0 comments on commit 4621a66

Please sign in to comment.