From 2a00fc5912833e80a452210e22d31a80398d2d3b Mon Sep 17 00:00:00 2001 From: wyzhang Date: Wed, 1 Apr 2020 11:17:59 -0700 Subject: [PATCH] Remove "activeDeadlineSeconds" setting on pre-/post- upgrade job. Why: Currently we set both backoffLimit and activeDeadlineSeconds on pre-/post- upgrade job. A job's activeDeadlineSeconds takes precedence over backoffLimit, thus pre-/post- upgrade job will not deploy additional pods once it reaches the time limit even if the backoffLimit is not hit. When this happens, the following condition is set on the job, but we are not checking that condition in version upgrade logic, thus leading to operator stucks in upgrade inprogress state. conditions: - lastProbeTime: "2020-04-01T09:48:21Z" lastTransitionTime: "2020-04-01T09:48:21Z" message: Job was active longer than specified deadline reason: DeadlineExceeded status: "True" type: Failed failed: 5 startTime: "2020-04-01T09:36:36Z" What: this change is to remove deadline seconds, thus simplify the logic to cap the retries solely based on counts. If we ever needed to have both time and count limits, then adding checking for deadline exceeded condition in upgrade logic is required. --- controllers/testdata/post_upgrade_job.json | 1 - controllers/testdata/pre_upgrade_job.json | 1 - controllers/version_update.go | 10 ++++++++++ templates/upgrade-job.yaml | 1 - 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/controllers/testdata/post_upgrade_job.json b/controllers/testdata/post_upgrade_job.json index 376bbc54..c46198f7 100644 --- a/controllers/testdata/post_upgrade_job.json +++ b/controllers/testdata/post_upgrade_job.json @@ -34,7 +34,6 @@ "uid": "a0afd92d-4b0d-11ea-8611-42010a800022" }, "spec": { - "activeDeadlineSeconds": 600, "backoffLimit": 10, "completions": 1, "parallelism": 1, diff --git a/controllers/testdata/pre_upgrade_job.json b/controllers/testdata/pre_upgrade_job.json index 1e13e6a9..8751b84b 100644 --- a/controllers/testdata/pre_upgrade_job.json +++ b/controllers/testdata/pre_upgrade_job.json @@ -34,7 +34,6 @@ "uid": "401828eb-4b1a-11ea-8611-42010a800022" }, "spec": { - "activeDeadlineSeconds": 600, "backoffLimit": 10, "completions": 1, "parallelism": 1, diff --git a/controllers/version_update.go b/controllers/version_update.go index 1523fa11..1f1381f8 100644 --- a/controllers/version_update.go +++ b/controllers/version_update.go @@ -154,6 +154,11 @@ func upgradeForBackend(master *v1alpha1.CDAPMaster, labels map[string]string, ob } // First, run pre-upgrade job + // + // Note that pre-upgrade job doesn't have an "activeDeadlineSeconds" set it on, so it will + // try as many as imageVersionUpgradeJobMaxRetryCount times before giving up. If we ever + // needed to set an overall deadline for the pre-upgrade job, the logic below needs to check + // deadline exceeded condition on job's status if !isConditionTrue(master, updateStatus.PreUpgradeSucceeded) { log.Printf("Version update: pre-upgrade job not completed") preJobName := getPreUpgradeJobName(master.Status.UpgradeStartTimeMillis) @@ -192,6 +197,11 @@ func upgradeForBackend(master *v1alpha1.CDAPMaster, labels map[string]string, ob } // At last, run post-upgrade job + // + // Note that post-upgrade job doesn't have an "activeDeadlineSeconds" set it on, so it will + // try as many as imageVersionUpgradeJobMaxRetryCount times before giving up. If we ever + // needed to set an overall deadline for the post-upgrade job, the logic below needs to check + // deadline exceeded condition on job's status if !isConditionTrue(master, updateStatus.PostUpgradeSucceeded) { log.Printf("Version update: post-upgrade job not completed") postJobName := getPostUpgradeJobName(master.Status.UpgradeStartTimeMillis) diff --git a/templates/upgrade-job.yaml b/templates/upgrade-job.yaml index 561c2e7d..d0e7404b 100644 --- a/templates/upgrade-job.yaml +++ b/templates/upgrade-job.yaml @@ -74,4 +74,3 @@ spec: {{end}} restartPolicy: Never backoffLimit: {{.BackoffLimit}} - activeDeadlineSeconds: 600