From 913d0a2463cfb61de30fd5d204beef4fc0100045 Mon Sep 17 00:00:00 2001 From: Ellis Tarn Date: Tue, 11 Apr 2023 10:17:54 -0700 Subject: [PATCH] fix: Requeue immediately after deprovisioning (#279) * fix: Requeue immediately after deprovisioning * Continue after deprovisioning errors * Shortcircuit if any deprovisioner fails --- pkg/controllers/deprovisioning/controller.go | 55 +++++++++++-------- .../emptymachineconsolidation.go | 3 +- .../multimachineconsolidation.go | 2 +- .../singlemachineconsolidation.go | 11 +--- pkg/controllers/deprovisioning/suite_test.go | 6 +- pkg/controllers/deprovisioning/types.go | 4 -- 6 files changed, 40 insertions(+), 41 deletions(-) diff --git a/pkg/controllers/deprovisioning/controller.go b/pkg/controllers/deprovisioning/controller.go index a94c114866..54b72cbc4f 100644 --- a/pkg/controllers/deprovisioning/controller.go +++ b/pkg/controllers/deprovisioning/controller.go @@ -57,6 +57,7 @@ type Controller struct { // pollingPeriod that we inspect cluster to look for opportunities to deprovision const pollingPeriod = 10 * time.Second +const immediately = time.Millisecond var errCandidateDeleting = fmt.Errorf("candidate is deleting") @@ -115,32 +116,13 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc // Attempt different deprovisioning methods. We'll only let one method perform an action isConsolidated := c.cluster.Consolidated() for _, d := range c.deprovisioners { - candidates, err := GetCandidates(ctx, c.cluster, c.kubeClient, c.clock, c.cloudProvider, d.ShouldDeprovision) + success, err := c.deprovision(ctx, d) if err != nil { - return reconcile.Result{}, fmt.Errorf("determining candidates, %w", err) + return reconcile.Result{}, fmt.Errorf("deprovisioning via %q, %w", d, err) } - // If there are no candidate nodes, move to the next deprovisioner - if len(candidates) == 0 { - continue - } - - // Determine the deprovisioning action - cmd, err := d.ComputeCommand(ctx, candidates...) - if err != nil { - return reconcile.Result{}, fmt.Errorf("computing deprovisioning decision, %w", err) - } - if cmd.action == actionDoNothing { - continue + if success { + return reconcile.Result{RequeueAfter: immediately}, nil } - if cmd.action == actionRetry { - return reconcile.Result{Requeue: true}, nil - } - - // Attempt to deprovision - if err := c.executeCommand(ctx, d, cmd); err != nil { - return reconcile.Result{}, fmt.Errorf("deprovisioning candidates, %w", err) - } - return reconcile.Result{Requeue: true}, nil } if !isConsolidated { @@ -151,6 +133,33 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc return reconcile.Result{RequeueAfter: pollingPeriod}, nil } +func (c *Controller) deprovision(ctx context.Context, deprovisioner Deprovisioner) (bool, error) { + candidates, err := GetCandidates(ctx, c.cluster, c.kubeClient, c.clock, c.cloudProvider, deprovisioner.ShouldDeprovision) + if err != nil { + return false, fmt.Errorf("determining candidates, %w", err) + } + // If there are no candidate nodes, move to the next deprovisioner + if len(candidates) == 0 { + return false, nil + } + + // Determine the deprovisioning action + cmd, err := deprovisioner.ComputeCommand(ctx, candidates...) + if err != nil { + return false, fmt.Errorf("computing deprovisioning decision, %w", err) + } + if cmd.action == actionDoNothing { + return false, nil + } + + // Attempt to deprovision + if err := c.executeCommand(ctx, deprovisioner, cmd); err != nil { + return false, fmt.Errorf("deprovisioning candidates, %w", err) + } + + return true, nil +} + func (c *Controller) executeCommand(ctx context.Context, d Deprovisioner, command Command) error { deprovisioningActionsPerformedCounter.With(prometheus.Labels{"action": fmt.Sprintf("%s/%s", d, command.action)}).Add(1) logging.FromContext(ctx).Infof("deprovisioning via %s %s", d, command) diff --git a/pkg/controllers/deprovisioning/emptymachineconsolidation.go b/pkg/controllers/deprovisioning/emptymachineconsolidation.go index 5e92674601..a414858e62 100644 --- a/pkg/controllers/deprovisioning/emptymachineconsolidation.go +++ b/pkg/controllers/deprovisioning/emptymachineconsolidation.go @@ -80,9 +80,8 @@ func (c *EmptyMachineConsolidation) ComputeCommand(ctx context.Context, candidat // the machine isn't a target of a recent scheduling simulation for _, n := range candidatesToDelete { if len(n.pods) != 0 && !c.cluster.IsNodeNominated(n.Name()) { - return Command{action: actionRetry}, nil + return Command{}, fmt.Errorf("command is no longer valid, %s", cmd) } } - return cmd, nil } diff --git a/pkg/controllers/deprovisioning/multimachineconsolidation.go b/pkg/controllers/deprovisioning/multimachineconsolidation.go index f487e4bca4..8f5f19d1ee 100644 --- a/pkg/controllers/deprovisioning/multimachineconsolidation.go +++ b/pkg/controllers/deprovisioning/multimachineconsolidation.go @@ -65,7 +65,7 @@ func (m *MultiMachineConsolidation) ComputeCommand(ctx context.Context, candidat } if !isValid { - return Command{action: actionRetry}, nil + return Command{}, fmt.Errorf("command is no longer valid, %s", cmd) } return cmd, nil } diff --git a/pkg/controllers/deprovisioning/singlemachineconsolidation.go b/pkg/controllers/deprovisioning/singlemachineconsolidation.go index 3b99c7106f..074c1995d1 100644 --- a/pkg/controllers/deprovisioning/singlemachineconsolidation.go +++ b/pkg/controllers/deprovisioning/singlemachineconsolidation.go @@ -50,7 +50,6 @@ func (c *SingleMachineConsolidation) ComputeCommand(ctx context.Context, candida } v := NewValidation(consolidationTTL, c.clock, c.cluster, c.kubeClient, c.provisioner, c.cloudProvider, c.recorder) - var failedValidation bool for _, candidate := range candidates { // compute a possible consolidation option cmd, err := c.computeConsolidation(ctx, candidate) @@ -58,7 +57,7 @@ func (c *SingleMachineConsolidation) ComputeCommand(ctx context.Context, candida logging.FromContext(ctx).Errorf("computing consolidation %s", err) continue } - if cmd.action == actionDoNothing || cmd.action == actionRetry { + if cmd.action == actionDoNothing { continue } @@ -68,18 +67,12 @@ func (c *SingleMachineConsolidation) ComputeCommand(ctx context.Context, candida continue } if !isValid { - failedValidation = true - continue + return Command{}, fmt.Errorf("command is no longer valid, %s", cmd) } if cmd.action == actionReplace || cmd.action == actionDelete { return cmd, nil } } - - // we failed validation, so we need to retry - if failedValidation { - return Command{action: actionRetry}, nil - } return Command{action: actionDoNothing}, nil } diff --git a/pkg/controllers/deprovisioning/suite_test.go b/pkg/controllers/deprovisioning/suite_test.go index 6d6ea21185..7db5019e56 100644 --- a/pkg/controllers/deprovisioning/suite_test.go +++ b/pkg/controllers/deprovisioning/suite_test.go @@ -1605,7 +1605,8 @@ var _ = Describe("Consolidation TTL", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + _, err := deprovisioningController.Reconcile(ctx, reconcile.Request{}) + Expect(err).To(HaveOccurred()) }() // wait for the deprovisioningController to block on the validation timeout @@ -1757,7 +1758,8 @@ var _ = Describe("Consolidation TTL", func() { defer GinkgoRecover() defer wg.Done() defer finished.Store(true) - ExpectReconcileSucceeded(ctx, deprovisioningController, client.ObjectKey{}) + _, err := deprovisioningController.Reconcile(ctx, reconcile.Request{}) + Expect(err).To(HaveOccurred()) }() // wait for the deprovisioningController to block on the validation timeout diff --git a/pkg/controllers/deprovisioning/types.go b/pkg/controllers/deprovisioning/types.go index 57ac93f175..47e1b90dbe 100644 --- a/pkg/controllers/deprovisioning/types.go +++ b/pkg/controllers/deprovisioning/types.go @@ -127,7 +127,6 @@ type action byte const ( actionDelete action = iota actionReplace - actionRetry actionDoNothing ) @@ -139,9 +138,6 @@ func (a action) String() string { // Deprovisioning action with replacement machines case actionReplace: return "replace" - // Deprovisioning failed for a retryable reason - case actionRetry: - return "retry" case actionDoNothing: return "do nothing" default: