From b19b01b2851ddd83ac89884f4e34968ce9dfcec7 Mon Sep 17 00:00:00 2001 From: RidRisR <79858083+RidRisR@users.noreply.github.com> Date: Tue, 7 Jan 2025 04:37:26 +0100 Subject: [PATCH] infinite backoff for create job --- pkg/controller/compact_status_updater.go | 9 +-- .../compact_backup_controller.go | 63 ++++++++++--------- 2 files changed, 35 insertions(+), 37 deletions(-) diff --git a/pkg/controller/compact_status_updater.go b/pkg/controller/compact_status_updater.go index c084dd3277..a602094518 100644 --- a/pkg/controller/compact_status_updater.go +++ b/pkg/controller/compact_status_updater.go @@ -146,15 +146,8 @@ func (r *CompactStatusUpdater) OnCreateJob(ctx context.Context, compact *v1alpha if err != nil { newStatus.State = string(v1alpha1.BackupRetryTheFailed) newStatus.Message = err.Error() - newStatus.RetryStatus = []v1alpha1.CompactRetryRecord{ - { - RetryNum: len(compact.Status.RetryStatus), - DetectFailedAt: metav1.NewTime(time.Now()), - RetryReason: err.Error(), - }, - } } else { - newStatus.State = string(v1alpha1.BackupRunning) + newStatus.State = string(v1alpha1.BackupPrepare) } return r.UpdateStatus(compact, newStatus) } diff --git a/pkg/controller/compactbackup/compact_backup_controller.go b/pkg/controller/compactbackup/compact_backup_controller.go index 14f95cd88a..f60e020b8f 100644 --- a/pkg/controller/compactbackup/compact_backup_controller.go +++ b/pkg/controller/compactbackup/compact_backup_controller.go @@ -42,6 +42,10 @@ import ( "k8s.io/utils/ptr" ) +const ( + maxInterval = 6 * time.Minute +) + // Controller controls backup. 
type Controller struct { deps *controller.Dependencies @@ -281,21 +285,21 @@ func (c *Controller) sync(key string) (err error) { return nil } - ok, err := c.precheckCompact(compact) + ok, err := c.checkJobStatus(compact) if err != nil { return err } if !ok { - klog.Infof("Compact %s/%s is not allowed, skip", ns, name) + klog.Infof("Compact %s/%s is not allowed to create job, skip", ns, name) return nil } - err = c.doCompact(compact.DeepCopy()) + err = c.createJob(compact.DeepCopy()) c.statusUpdater.OnCreateJob(context.TODO(), compact, err) return err } -func (c *Controller) doCompact(compact *v1alpha1.CompactBackup) error { +func (c *Controller) createJob(compact *v1alpha1.CompactBackup) error { ns := compact.GetNamespace() name := compact.GetName() compactJobName := compact.GetName() @@ -474,17 +478,34 @@ func (c *Controller) makeCompactJob(compact *v1alpha1.CompactBackup) (*batchv1.J return job, "", nil } -// precheckCompact checks if doCompact is allowed to run +func (c *Controller) validate(compact *v1alpha1.CompactBackup) error { + spec := compact.Spec + if spec.StartTs == "" { + return errors.NewNoStackError("start-ts must be set") + } + if spec.EndTs == "" { + return errors.NewNoStackError("end-ts must be set") + } + if spec.Concurrency <= 0 { + return errors.NewNoStackError("concurrency must be greater than 0") + } + if spec.MaxRetryTimes < 0 { + return errors.NewNoStackError("maxRetryTimes must be greater than or equal to 0") + } + return nil +} + +// checkJobStatus checks if createJob is allowed to run // Only if there is no other compact job existing, doCompact is allowed // If the existing job failed, update compact status -func (c *Controller) precheckCompact(compact *v1alpha1.CompactBackup) (bool, error) { +func (c *Controller) checkJobStatus(compact *v1alpha1.CompactBackup) (bool, error) { ns := compact.GetNamespace() name := compact.GetName() job, err := c.deps.KubeClientset.BatchV1().Jobs(ns).Get(context.TODO(), name, metav1.GetOptions{}) if err != 
nil { if errors.IsNotFound(err) { - return c.allowCompact(compact), nil + return c.allowCreateJob(compact), nil } klog.Errorf("Failed to get job %s for compact %s/%s, error %v", name, ns, name, err) return false, err @@ -509,24 +530,7 @@ func (c *Controller) precheckCompact(compact *v1alpha1.CompactBackup) (bool, err return false, nil } -func (c *Controller) validate(compact *v1alpha1.CompactBackup) error { - spec := compact.Spec - if spec.StartTs == "" { - return errors.NewNoStackError("start-ts must be set") - } - if spec.EndTs == "" { - return errors.NewNoStackError("end-ts must be set") - } - if spec.Concurrency <= 0 { - return errors.NewNoStackError("concurrency must be greater than 0") - } - if spec.MaxRetryTimes < 0 { - return errors.NewNoStackError("maxRetryTimes must be greater than or equal to 0") - } - return nil -} - -func (c *Controller) allowCompact(compact *v1alpha1.CompactBackup) bool { +func (c *Controller) allowCreateJob(compact *v1alpha1.CompactBackup) bool { ns := compact.GetNamespace() name := compact.GetName() @@ -535,15 +539,16 @@ func (c *Controller) allowCompact(compact *v1alpha1.CompactBackup) bool { if attempts <= 1 { return 0 } - return 10 * time.Duration(math.Pow(10, float64(attempts-1))) * time.Second + interval := 10 * time.Duration(math.Pow(10, float64(attempts-1))) * time.Second + if interval > maxInterval { + return maxInterval + } + return interval } attempts := len(compact.Status.RetryStatus) if attempts > 0 { lastRetry := compact.Status.RetryStatus[attempts-1] - if lastRetry.RetryNum >= int(compact.Spec.MaxRetryTimes) { - return false - } backoff := expBackoff(attempts) if time.Since(lastRetry.DetectFailedAt.Time) < backoff { klog.Infof("Compact: [%s/%s] backoff in effect, skipping retry.", ns, name)