feat: add in termination check for nodes without machines
njtran committed Mar 28, 2023
1 parent cb03ff7 commit a7ef719
Showing 2 changed files with 44 additions and 5 deletions.
10 changes: 7 additions & 3 deletions pkg/controllers/termination/controller.go
@@ -77,11 +77,15 @@ func (c *Controller) Finalize(ctx context.Context, node *v1.Node) (reconcile.Res
 		return reconcile.Result{}, fmt.Errorf("cordoning node, %w", err)
 	}
 	if err := c.terminator.Drain(ctx, node); err != nil {
-		if terminator.IsNodeDrainError(err) {
-			c.recorder.Publish(terminatorevents.NodeFailedToDrain(node, err))
+		if !terminator.IsNodeDrainError(err) {
+			return reconcile.Result{}, fmt.Errorf("draining node, %w", err)
+		}
+		c.recorder.Publish(terminatorevents.NodeFailedToDrain(node, err))
+		// If there are pods that cannot be drained, fetch the underlying machine.
+		// If the underlying machine no longer exists, force terminate the node.
+		if _, err := c.cloudProvider.Get(ctx, node.Spec.ProviderID); !cloudprovider.IsMachineNotFoundError(err) {
 			return reconcile.Result{Requeue: true}, nil
 		}
-		return reconcile.Result{}, fmt.Errorf("draining node, %w", err)
 	}
 	if err := c.cloudProvider.Delete(ctx, machineutil.NewFromNode(node)); cloudprovider.IgnoreMachineNotFoundError(err) != nil {
 		return reconcile.Result{}, fmt.Errorf("terminating cloudprovider instance, %w", err)
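The new Get check classifies errors with the cloudprovider package's machine-not-found helpers (IsMachineNotFoundError, IgnoreMachineNotFoundError). A minimal sketch of how such sentinel-error helpers are commonly shaped, assuming a wrapper type named MachineNotFoundError; the function names mirror the calls in the hunk above, but the bodies are illustrative rather than the repository's actual code:

package cloudprovider

import "errors"

// MachineNotFoundError signals that the cloud provider has no machine
// backing the requested provider ID.
type MachineNotFoundError struct {
	error
}

// NewMachineNotFoundError wraps err so callers can classify it with errors.As.
func NewMachineNotFoundError(err error) *MachineNotFoundError {
	return &MachineNotFoundError{error: err}
}

// IsMachineNotFoundError reports whether err is (or wraps) a MachineNotFoundError.
func IsMachineNotFoundError(err error) bool {
	if err == nil {
		return false
	}
	mnfErr := &MachineNotFoundError{}
	return errors.As(err, &mnfErr)
}

// IgnoreMachineNotFoundError maps machine-not-found errors to nil, so a
// delete of an already-gone machine can be treated as success.
func IgnoreMachineNotFoundError(err error) error {
	if IsMachineNotFoundError(err) {
		return nil
	}
	return err
}

Wrapping the underlying error, rather than returning a fixed sentinel, lets each provider attach its own detail while errors.As still classifies the failure.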
39 changes: 37 additions & 2 deletions pkg/controllers/termination/suite_test.go
@@ -53,6 +53,7 @@ var evictionQueue *terminator.EvictionQueue
 var env *test.Environment
 var defaultOwnerRefs = []metav1.OwnerReference{{Kind: "ReplicaSet", APIVersion: "appsv1", Name: "rs", UID: "1234567890"}}
 var fakeClock *clock.FakeClock
+var cloudProvider *fake.CloudProvider

 func TestAPIs(t *testing.T) {
 	ctx = TestContextWithLogger(t)
@@ -64,7 +65,7 @@ var _ = BeforeSuite(func() {
 	fakeClock = clock.NewFakeClock(time.Now())
 	env = test.NewEnvironment(scheme.Scheme, test.WithCRDs(apis.CRDs...))

-	cloudProvider := fake.NewCloudProvider()
+	cloudProvider = fake.NewCloudProvider()
 	evictionQueue = terminator.NewEvictionQueue(ctx, env.KubernetesInterface.CoreV1(), events.NewRecorder(&record.FakeRecorder{}))
 	terminationController = termination.NewController(env.Client, cloudProvider, terminator.NewTerminator(fakeClock, env.Client, evictionQueue), events.NewRecorder(&record.FakeRecorder{}))
 })
@@ -75,14 +76,17 @@ var _ = AfterSuite(func() {

 var _ = Describe("Termination", func() {
 	var node *v1.Node
+	var machine *v1alpha5.Machine

 	BeforeEach(func() {
-		node = test.Node(test.NodeOptions{ObjectMeta: metav1.ObjectMeta{Finalizers: []string{v1alpha5.TerminationFinalizer}}})
+		machine, node = test.MachineAndNode(v1alpha5.Machine{ObjectMeta: metav1.ObjectMeta{Finalizers: []string{v1alpha5.TerminationFinalizer}}})
+		cloudProvider.CreatedMachines[node.Spec.ProviderID] = machine
 	})

 	AfterEach(func() {
 		ExpectCleanedUp(ctx, env.Client)
 		fakeClock.SetTime(time.Now())
+		cloudProvider.Reset()
 	})

 	Context("Reconciliation", func() {
@@ -362,6 +366,37 @@ var _ = Describe("Termination", func() {
 			ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node))
 			ExpectNotFound(ctx, env.Client, node)
 		})
+		It("should delete nodes with no underlying machine even if not fully drained", func() {
+			pods := []*v1.Pod{
+				test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}),
+				test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}}),
+			}
+			ExpectApplied(ctx, env.Client, node, pods[0], pods[1])
+
+			// Trigger Termination Controller
+			Expect(env.Client.Delete(ctx, node)).To(Succeed())
+			node = ExpectNodeExists(ctx, env.Client, node.Name)
+			ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node))
+
+			// Expect the pods to be evicted
+			ExpectEvicted(env.Client, pods[0], pods[1])
+
+			// Expect node to exist and be draining, but not deleted
+			node = ExpectNodeExists(ctx, env.Client, node.Name)
+			ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node))
+			ExpectNodeDraining(env.Client, node.Name)
+
+			// After this, the node still has one pod that is evicting.
+			ExpectDeleted(ctx, env.Client, pods[1])
+
+			// Remove the node from created machines so that the cloud provider returns DNE
+			cloudProvider.CreatedMachines = map[string]*v1alpha5.Machine{}
+
+			// Reconcile to delete node
+			node = ExpectNodeExists(ctx, env.Client, node.Name)
+			ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node))
+			ExpectNotFound(ctx, env.Client, node)
+		})
 		It("should wait for pods to terminate", func() {
 			pod := test.Pod(test.PodOptions{NodeName: node.Name, ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}})
 			fakeClock.SetTime(time.Now()) // make our fake clock match the pod creation time
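The new test case empties cloudProvider.CreatedMachines so the fake provider reports the machine as gone, which is what lets the controller force-terminate a node that cannot finish draining. A rough sketch of the fake Get this relies on, assuming CreatedMachines is keyed by provider ID; the struct layout, mutex field, and import paths here are assumptions for illustration, not the repository's actual fake:

package fake

import (
	"context"
	"fmt"
	"sync"

	"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
	"github.com/aws/karpenter-core/pkg/cloudprovider"
)

// CloudProvider is a test double. CreatedMachines stands in for the set of
// instances the provider knows about, keyed by provider ID.
type CloudProvider struct {
	mu              sync.RWMutex // assumed guard; the real fake may synchronize differently
	CreatedMachines map[string]*v1alpha5.Machine
}

// Get returns the machine for a provider ID, or a machine-not-found error,
// which is the signal the termination controller uses to force-delete a node.
func (c *CloudProvider) Get(_ context.Context, providerID string) (*v1alpha5.Machine, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	if machine, ok := c.CreatedMachines[providerID]; ok {
		return machine.DeepCopy(), nil
	}
	return nil, cloudprovider.NewMachineNotFoundError(fmt.Errorf("no machine exists with provider id %q", providerID))
}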
