feat: add in termination check for nodes without machines (#252)
* feat: add in termination check for nodes without machines

* reorganize code

* change controller requeue logic

* fix ci test flake
njtran authored Mar 31, 2023
1 parent 4bb544e commit 27ccd92
Showing 3 changed files with 62 additions and 15 deletions.
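
Read together, the diffs below make node termination tolerant of machines that are already gone: when a drain cannot complete, the termination controller now asks the cloud provider for the machine behind the node, and if the machine no longer exists it removes the termination finalizer instead of requeueing forever. The sketch that follows condenses that decision logic; the types, names, and error value here are hypothetical stand-ins, not the karpenter-core APIs shown in the diff.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// errMachineNotFound stands in for the error matched by
// cloudprovider.IsMachineNotFoundError in the real controller.
var errMachineNotFound = errors.New("machine not found")

// machineGetter is a hypothetical slice of the cloud provider API.
type machineGetter interface {
	Get(ctx context.Context, providerID string) (struct{}, error)
}

// finalizeSketch mirrors the new drain-failure path: a blocked drain no
// longer requeues unconditionally; the backing machine is looked up first.
func finalizeSketch(ctx context.Context, cp machineGetter, providerID string, drainErr error) (requeueAfter time.Duration, removeFinalizer bool, err error) {
	if drainErr == nil {
		// Fully drained: the real code deletes the instance, then the finalizer.
		return 0, true, nil
	}
	if _, getErr := cp.Get(ctx, providerID); getErr != nil {
		if errors.Is(getErr, errMachineNotFound) {
			// Machine is already gone: stop waiting on the drain.
			return 0, true, nil
		}
		return 0, false, fmt.Errorf("getting machine, %w", getErr)
	}
	// Machine still exists: check the drain again shortly.
	return 1 * time.Second, false, nil
}

// emptyProvider simulates a cloud provider that has no machines left.
type emptyProvider struct{}

func (emptyProvider) Get(context.Context, string) (struct{}, error) {
	return struct{}{}, errMachineNotFound
}

func main() {
	after, remove, err := finalizeSketch(context.Background(), emptyProvider{}, "fake:///i-gone", errors.New("2 pods still evicting"))
	fmt.Println(after, remove, err) // 0s true <nil>: finalizer removed despite the blocked drain
}
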
pkg/controllers/machine/garbagecollect/suite_test.go (6 changes: 3 additions & 3 deletions)
@@ -105,7 +105,7 @@ var _ = Describe("GarbageCollection", func() {
 		machine = ExpectExists(ctx, env.Client, machine)
 
 		// Step forward to move past the cache eventual consistency timeout
-		fakeClock.Step(time.Second * 20)
+		fakeClock.SetTime(time.Now().Add(time.Second * 20))
 
 		// Delete the machine from the cloudprovider
 		Expect(cloudProvider.Delete(ctx, machine)).To(Succeed())
@@ -132,7 +132,7 @@ var _ = Describe("GarbageCollection", func() {
 		}
 
 		// Step forward to move past the cache eventual consistency timeout
-		fakeClock.Step(time.Second * 20)
+		fakeClock.SetTime(time.Now().Add(time.Second * 20))
 
 		for _, machine := range machines {
 			// Delete the machine from the cloudprovider
@@ -160,7 +160,7 @@ var _ = Describe("GarbageCollection", func() {
 		machine = ExpectExists(ctx, env.Client, machine)
 
 		// Step forward to move past the cache eventual consistency timeout
-		fakeClock.Step(time.Second * 20)
+		fakeClock.SetTime(time.Now().Add(time.Second * 20))
 
 		// Reconcile the Machine. It should not be deleted by this flow since it has never been registered
 		ExpectReconcileSucceeded(ctx, garbageCollectionController, client.ObjectKey{})
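
A note on the fake-clock change above, as a reading of the "fix ci test flake" bullet rather than anything stated in the commit: Step advances the fake clock relative to its own current time, which lags wall-clock time by however long the suite has already been running, while the cache eventual-consistency cutoff is computed from real timestamps; SetTime(time.Now().Add(...)) re-anchors the fake clock to wall-clock now plus the offset, so it always lands past the cutoff. A small self-contained illustration using k8s.io/utils/clock/testing:

package main

import (
	"fmt"
	"time"

	clock "k8s.io/utils/clock/testing"
)

func main() {
	// The suite's fake clock is created once, at startup.
	fakeClock := clock.NewFakeClock(time.Now())

	// Stand-in for test work that happens before the step; on a slow
	// CI runner this gap can be much larger.
	time.Sleep(50 * time.Millisecond)

	// Step advances from the fake clock's own (now stale) time.
	fakeClock.Step(time.Second * 20)
	fmt.Println("after Step:   ", fakeClock.Now())

	// SetTime re-anchors to wall-clock now before adding the offset, so
	// the fake clock is always past a cutoff computed from real time.
	fakeClock.SetTime(time.Now().Add(time.Second * 20))
	fmt.Println("after SetTime:", fakeClock.Now())
}
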
pkg/controllers/termination/controller.go (32 changes: 22 additions & 10 deletions)
@@ -77,24 +77,36 @@ func (c *Controller) Finalize(ctx context.Context, node *v1.Node) (reconcile.Res
 		return reconcile.Result{}, fmt.Errorf("cordoning node, %w", err)
 	}
 	if err := c.terminator.Drain(ctx, node); err != nil {
-		if terminator.IsNodeDrainError(err) {
-			c.recorder.Publish(terminatorevents.NodeFailedToDrain(node, err))
-			return reconcile.Result{Requeue: true}, nil
+		if !terminator.IsNodeDrainError(err) {
+			return reconcile.Result{}, fmt.Errorf("draining node, %w", err)
 		}
-		return reconcile.Result{}, fmt.Errorf("draining node, %w", err)
+		c.recorder.Publish(terminatorevents.NodeFailedToDrain(node, err))
+		// If the underlying machine no longer exists.
+		if _, err := c.cloudProvider.Get(ctx, node.Spec.ProviderID); err != nil {
+			if cloudprovider.IsMachineNotFoundError(err) {
+				return reconcile.Result{}, c.removeFinalizer(ctx, node)
+			}
+			return reconcile.Result{}, fmt.Errorf("getting machine, %w", err)
+		}
+		return reconcile.Result{RequeueAfter: 1 * time.Second}, nil
 	}
 
 	if err := c.cloudProvider.Delete(ctx, machineutil.NewFromNode(node)); cloudprovider.IgnoreMachineNotFoundError(err) != nil {
 		return reconcile.Result{}, fmt.Errorf("terminating cloudprovider instance, %w", err)
 	}
-	stored := node.DeepCopy()
-	controllerutil.RemoveFinalizer(node, v1alpha5.TerminationFinalizer)
-	if !equality.Semantic.DeepEqual(stored, node) {
-		if err := c.kubeClient.Patch(ctx, node, client.MergeFrom(stored)); err != nil {
-			return reconcile.Result{}, client.IgnoreNotFound(err)
+	return reconcile.Result{}, c.removeFinalizer(ctx, node)
+}
+
+func (c *Controller) removeFinalizer(ctx context.Context, n *v1.Node) error {
+	stored := n.DeepCopy()
+	controllerutil.RemoveFinalizer(n, v1alpha5.TerminationFinalizer)
+	if !equality.Semantic.DeepEqual(stored, n) {
+		if err := c.kubeClient.Patch(ctx, n, client.MergeFrom(stored)); err != nil {
+			return client.IgnoreNotFound(fmt.Errorf("patching node, %w", err))
 		}
 		logging.FromContext(ctx).Infof("deleted node")
 	}
-	return reconcile.Result{}, nil
+	return nil
 }
 
 func (c *Controller) Builder(ctx context.Context, m manager.Manager) corecontroller.Builder {
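
The requeue change above swaps reconcile.Result{Requeue: true} for RequeueAfter: 1 * time.Second. In controller-runtime, Requeue: true re-enqueues through the workqueue's per-item rate limiter, so consecutive retries back off exponentially, while RequeueAfter re-enqueues after a flat delay; the flat delay keeps the drain re-check cadence steady while pods evict. A minimal sketch of the two result shapes (function names here are illustrative, not from the commit):

package main

import (
	"fmt"
	"time"

	"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

// backoffRequeue goes back through the workqueue's per-item rate
// limiter, so consecutive retries wait exponentially longer.
func backoffRequeue() (reconcile.Result, error) {
	return reconcile.Result{Requeue: true}, nil
}

// fixedRequeue re-enqueues after a flat delay, independent of how many
// times the object has already been retried.
func fixedRequeue() (reconcile.Result, error) {
	return reconcile.Result{RequeueAfter: 1 * time.Second}, nil
}

func main() {
	r1, _ := backoffRequeue()
	r2, _ := fixedRequeue()
	fmt.Println(r1.Requeue, r2.RequeueAfter)
}
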
pkg/controllers/termination/suite_test.go (39 changes: 37 additions & 2 deletions)
@@ -53,6 +53,7 @@ var evictionQueue *terminator.EvictionQueue
 var env *test.Environment
 var defaultOwnerRefs = []metav1.OwnerReference{{Kind: "ReplicaSet", APIVersion: "appsv1", Name: "rs", UID: "1234567890"}}
 var fakeClock *clock.FakeClock
+var cloudProvider *fake.CloudProvider
 
 func TestAPIs(t *testing.T) {
 	ctx = TestContextWithLogger(t)
@@ -64,7 +65,7 @@ var _ = BeforeSuite(func() {
 	fakeClock = clock.NewFakeClock(time.Now())
 	env = test.NewEnvironment(scheme.Scheme, test.WithCRDs(apis.CRDs...))
 
-	cloudProvider := fake.NewCloudProvider()
+	cloudProvider = fake.NewCloudProvider()
 	evictionQueue = terminator.NewEvictionQueue(ctx, env.KubernetesInterface.CoreV1(), events.NewRecorder(&record.FakeRecorder{}))
 	terminationController = termination.NewController(env.Client, cloudProvider, terminator.NewTerminator(fakeClock, env.Client, evictionQueue), events.NewRecorder(&record.FakeRecorder{}))
 })
@@ -75,14 +76,17 @@ var _ = AfterSuite(func() {
 
 var _ = Describe("Termination", func() {
 	var node *v1.Node
+	var machine *v1alpha5.Machine
 
 	BeforeEach(func() {
-		node = test.Node(test.NodeOptions{ObjectMeta: metav1.ObjectMeta{Finalizers: []string{v1alpha5.TerminationFinalizer}}})
+		machine, node = test.MachineAndNode(v1alpha5.Machine{ObjectMeta: metav1.ObjectMeta{Finalizers: []string{v1alpha5.TerminationFinalizer}}})
+		cloudProvider.CreatedMachines[node.Spec.ProviderID] = machine
 	})
 
 	AfterEach(func() {
 		ExpectCleanedUp(ctx, env.Client)
 		fakeClock.SetTime(time.Now())
+		cloudProvider.Reset()
 	})
 
 	Context("Reconciliation", func() {
@@ -362,6 +366,37 @@ var _ = Describe("Termination", func() {
 			ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node))
 			ExpectNotFound(ctx, env.Client, node)
 		})
+		It("should delete nodes with no underlying machine even if not fully drained", func() {
+			pods := []*v1.Pod{
+				test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}),
+				test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}),
+			}
+			ExpectApplied(ctx, env.Client, node, pods[0], pods[1])
+
+			// Trigger Termination Controller
+			Expect(env.Client.Delete(ctx, node)).To(Succeed())
+			node = ExpectNodeExists(ctx, env.Client, node.Name)
+			ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node))
+
+			// Expect the pods to be evicted
+			ExpectEvicted(env.Client, pods[0], pods[1])
+
+			// Expect node to exist and be draining, but not deleted
+			node = ExpectNodeExists(ctx, env.Client, node.Name)
+			ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node))
+			ExpectNodeDraining(env.Client, node.Name)
+
+			// After this, the node still has one pod that is evicting.
+			ExpectDeleted(ctx, env.Client, pods[1])
+
+			// Remove the node from created machines so that the cloud provider returns DNE
+			cloudProvider.CreatedMachines = map[string]*v1alpha5.Machine{}
+
+			// Reconcile to delete node
+			node = ExpectNodeExists(ctx, env.Client, node.Name)
+			ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node))
+			ExpectNotFound(ctx, env.Client, node)
+		})
 		It("should wait for pods to terminate", func() {
 			pod := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}})
 			fakeClock.SetTime(time.Now()) // make our fake clock match the pod creation time
