From 6fdbc9ba13cca97dbf94974ce333df6efdd2b4a7 Mon Sep 17 00:00:00 2001 From: elankath Date: Tue, 1 Aug 2023 12:01:03 +0530 Subject: [PATCH 01/11] dont skip drain for unhealthy nodes --- docs/documents/apis.md | 15 ++ ...achine.sapcloud.io_machinedeployments.yaml | 5 + .../crds/machine.sapcloud.io_machines.yaml | 5 + .../crds/machine.sapcloud.io_machinesets.yaml | 10 ++ pkg/apis/machine/types.go | 3 + pkg/apis/machine/v1alpha1/machine_types.go | 5 +- .../v1alpha1/zz_generated.conversion.go | 2 + .../machine/v1alpha1/zz_generated.deepcopy.go | 1 + pkg/apis/machine/zz_generated.deepcopy.go | 1 + pkg/openapi/openapi_generated.go | 7 + pkg/util/provider/drain/drain.go | 7 +- .../provider/machinecontroller/machine.go | 18 ++- .../machinecontroller/machine_test.go | 41 +++--- .../machinecontroller/machine_util.go | 139 ++++++++++++++++-- pkg/util/provider/machineutils/utils.go | 3 + pkg/util/provider/metrics/metrics.go | 2 +- 16 files changed, 219 insertions(+), 45 deletions(-) diff --git a/docs/documents/apis.md b/docs/documents/apis.md index 393539385..00ab4b681 100644 --- a/docs/documents/apis.md +++ b/docs/documents/apis.md @@ -914,6 +914,21 @@ MachineOperationType

Type of operation

+ + +lastStateTransitionTime + + + + +Kubernetes meta/v1.Time + + + + +

LastStateTransitionTime represents the Last Operation State Transition Time.

+ +
diff --git a/kubernetes/crds/machine.sapcloud.io_machinedeployments.yaml b/kubernetes/crds/machine.sapcloud.io_machinedeployments.yaml index efa51054d..f17f9961d 100644 --- a/kubernetes/crds/machine.sapcloud.io_machinedeployments.yaml +++ b/kubernetes/crds/machine.sapcloud.io_machinedeployments.yaml @@ -431,6 +431,11 @@ spec: description: description: Description of the current operation type: string + lastStateTransitionTime: + description: LastStateTransitionTime represents the Last + Operation State Transition Time. + format: date-time + type: string lastUpdateTime: description: Last update time of current operation format: date-time diff --git a/kubernetes/crds/machine.sapcloud.io_machines.yaml b/kubernetes/crds/machine.sapcloud.io_machines.yaml index 651b2b9cf..511c56e27 100644 --- a/kubernetes/crds/machine.sapcloud.io_machines.yaml +++ b/kubernetes/crds/machine.sapcloud.io_machines.yaml @@ -263,6 +263,11 @@ spec: description: description: Description of the current operation type: string + lastStateTransitionTime: + description: LastStateTransitionTime represents the Last Operation + State Transition Time. + format: date-time + type: string lastUpdateTime: description: Last update time of current operation format: date-time diff --git a/kubernetes/crds/machine.sapcloud.io_machinesets.yaml b/kubernetes/crds/machine.sapcloud.io_machinesets.yaml index 4f362a048..7387da025 100644 --- a/kubernetes/crds/machine.sapcloud.io_machinesets.yaml +++ b/kubernetes/crds/machine.sapcloud.io_machinesets.yaml @@ -313,6 +313,11 @@ spec: description: description: Description of the current operation type: string + lastStateTransitionTime: + description: LastStateTransitionTime represents the Last + Operation State Transition Time. + format: date-time + type: string lastUpdateTime: description: Last update time of current operation format: date-time @@ -347,6 +352,11 @@ spec: description: description: Description of the current operation type: string + lastStateTransitionTime: + description: LastStateTransitionTime represents the Last Operation + State Transition Time. + format: date-time + type: string lastUpdateTime: description: Last update time of current operation format: date-time diff --git a/pkg/apis/machine/types.go b/pkg/apis/machine/types.go index 44a01719d..71bcf9122 100644 --- a/pkg/apis/machine/types.go +++ b/pkg/apis/machine/types.go @@ -195,6 +195,9 @@ type LastOperation struct { // Type of operation Type MachineOperationType + + // LastStateTransitionTime represents the Last Operation State Transition Time. + LastStateTransitionTime metav1.Time } // MachinePhase is a label for the condition of a machines at the current time. diff --git a/pkg/apis/machine/v1alpha1/machine_types.go b/pkg/apis/machine/v1alpha1/machine_types.go index 1051f31de..1f601e9d3 100644 --- a/pkg/apis/machine/v1alpha1/machine_types.go +++ b/pkg/apis/machine/v1alpha1/machine_types.go @@ -130,6 +130,9 @@ type LastOperation struct { // Type of operation Type MachineOperationType `json:"type,omitempty"` + + // LastStateTransitionTime represents the Last Operation State Transition Time. + LastStateTransitionTime metav1.Time `json:"lastStateTransitionTime,omitempty"` } // MachinePhase is a label for the condition of a machines at the current time. 
@@ -188,7 +191,7 @@ const ( // MachineOperationHealthCheck indicates that the operation was a create MachineOperationHealthCheck MachineOperationType = "HealthCheck" - // MachineOperationDelete indicates that the operation was a create + // MachineOperationDelete indicates that the operation was a delete MachineOperationDelete MachineOperationType = "Delete" ) diff --git a/pkg/apis/machine/v1alpha1/zz_generated.conversion.go b/pkg/apis/machine/v1alpha1/zz_generated.conversion.go index 5b027828d..c5ab0bbc1 100644 --- a/pkg/apis/machine/v1alpha1/zz_generated.conversion.go +++ b/pkg/apis/machine/v1alpha1/zz_generated.conversion.go @@ -365,6 +365,7 @@ func autoConvert_v1alpha1_LastOperation_To_machine_LastOperation(in *LastOperati out.LastUpdateTime = in.LastUpdateTime out.State = machine.MachineState(in.State) out.Type = machine.MachineOperationType(in.Type) + out.LastStateTransitionTime = in.LastStateTransitionTime return nil } @@ -378,6 +379,7 @@ func autoConvert_machine_LastOperation_To_v1alpha1_LastOperation(in *machine.Las out.LastUpdateTime = in.LastUpdateTime out.State = MachineState(in.State) out.Type = MachineOperationType(in.Type) + out.LastStateTransitionTime = in.LastStateTransitionTime return nil } diff --git a/pkg/apis/machine/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/machine/v1alpha1/zz_generated.deepcopy.go index 35b368e1a..a11f3c542 100644 --- a/pkg/apis/machine/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/machine/v1alpha1/zz_generated.deepcopy.go @@ -65,6 +65,7 @@ func (in *CurrentStatus) DeepCopy() *CurrentStatus { func (in *LastOperation) DeepCopyInto(out *LastOperation) { *out = *in in.LastUpdateTime.DeepCopyInto(&out.LastUpdateTime) + in.LastStateTransitionTime.DeepCopyInto(&out.LastStateTransitionTime) return } diff --git a/pkg/apis/machine/zz_generated.deepcopy.go b/pkg/apis/machine/zz_generated.deepcopy.go index e9bb94643..fcd9752cb 100644 --- a/pkg/apis/machine/zz_generated.deepcopy.go +++ b/pkg/apis/machine/zz_generated.deepcopy.go @@ -65,6 +65,7 @@ func (in *CurrentStatus) DeepCopy() *CurrentStatus { func (in *LastOperation) DeepCopyInto(out *LastOperation) { *out = *in in.LastUpdateTime.DeepCopyInto(&out.LastUpdateTime) + in.LastStateTransitionTime.DeepCopyInto(&out.LastStateTransitionTime) return } diff --git a/pkg/openapi/openapi_generated.go b/pkg/openapi/openapi_generated.go index 0667336c2..501227f8a 100644 --- a/pkg/openapi/openapi_generated.go +++ b/pkg/openapi/openapi_generated.go @@ -434,6 +434,13 @@ func schema_pkg_apis_machine_v1alpha1_LastOperation(ref common.ReferenceCallback Format: "", }, }, + "lastStateTransitionTime": { + SchemaProps: spec.SchemaProps{ + Description: "LastStateTransitionTime represents the Last Operation State Transition Time.", + Default: map[string]interface{}{}, + Ref: ref("k8s.io/apimachinery/pkg/apis/meta/v1.Time"), + }, + }, }, }, }, diff --git a/pkg/util/provider/drain/drain.go b/pkg/util/provider/drain/drain.go index 0b5a9f6e4..db3622545 100644 --- a/pkg/util/provider/drain/drain.go +++ b/pkg/util/provider/drain/drain.go @@ -33,8 +33,6 @@ import ( "time" "github.com/Masterminds/semver" - "github.com/gardener/machine-controller-manager/pkg/util/k8sutils" - "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" corev1 "k8s.io/api/core/v1" policyv1 "k8s.io/api/policy/v1" policyv1beta1 "k8s.io/api/policy/v1beta1" @@ -49,6 +47,9 @@ import ( policyv1listers "k8s.io/client-go/listers/policy/v1" policyv1beta1listers "k8s.io/client-go/listers/policy/v1beta1" "k8s.io/klog/v2" + + 
"github.com/gardener/machine-controller-manager/pkg/util/k8sutils" + "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" ) // Options are configurable options while draining a node before deletion @@ -211,7 +212,9 @@ func (o *Options) RunDrain(ctx context.Context) error { } err := o.deleteOrEvictPodsSimple(ctx) + return err + } func (o *Options) deleteOrEvictPodsSimple(ctx context.Context) error { diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 7435a3828..52703f739 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -24,13 +24,6 @@ import ( "strings" "time" - machineapi "github.com/gardener/machine-controller-manager/pkg/apis/machine" - "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" - "github.com/gardener/machine-controller-manager/pkg/apis/machine/validation" - "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" - "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes" - "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/status" - "github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -39,6 +32,14 @@ import ( "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/tools/cache" "k8s.io/klog/v2" + + machineapi "github.com/gardener/machine-controller-manager/pkg/apis/machine" + "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" + "github.com/gardener/machine-controller-manager/pkg/apis/machine/validation" + "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/status" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils" ) /* @@ -592,6 +593,9 @@ func (c *controller) triggerDeletionFlow(ctx context.Context, deleteMachineReque case strings.Contains(machine.Status.LastOperation.Description, machineutils.InitiateDrain): return c.drainNode(ctx, deleteMachineRequest) + case strings.Contains(machine.Status.LastOperation.Description, machineutils.DelVolumesAttachmentsAndWaitForDetach): + return c.deleteNodeVolAttachmentsAndWaitForDetach(ctx, deleteMachineRequest) + case strings.Contains(machine.Status.LastOperation.Description, machineutils.InitiateVMDeletion): return c.deleteVM(ctx, deleteMachineRequest) diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 9204628cb..c8f70e511 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -21,15 +21,6 @@ import ( "math" "time" - machineapi "github.com/gardener/machine-controller-manager/pkg/apis/machine" - "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" - "github.com/gardener/machine-controller-manager/pkg/apis/machine/validation" - fakemachineapi "github.com/gardener/machine-controller-manager/pkg/client/clientset/versioned/typed/machine/v1alpha1/fake" - customfake "github.com/gardener/machine-controller-manager/pkg/fakeclient" - 
"github.com/gardener/machine-controller-manager/pkg/util/provider/driver" - "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes" - "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/status" - "github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils" . "github.com/onsi/ginkgo" . "github.com/onsi/ginkgo/extensions/table" . "github.com/onsi/gomega" @@ -39,6 +30,16 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/validation/field" k8stesting "k8s.io/client-go/testing" + + machineapi "github.com/gardener/machine-controller-manager/pkg/apis/machine" + "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" + "github.com/gardener/machine-controller-manager/pkg/apis/machine/validation" + fakemachineapi "github.com/gardener/machine-controller-manager/pkg/client/clientset/versioned/typed/machine/v1alpha1/fake" + customfake "github.com/gardener/machine-controller-manager/pkg/fakeclient" + "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/status" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils" ) const testNamespace = "test" @@ -1631,7 +1632,7 @@ var _ = Describe("machine", func() { ), }, }), - Entry("Drain skipping as machine is NotReady for a long time (5 minutes)", &data{ + Entry("Force Drain as machine is NotReady for a long time (5 minutes)", &data{ setup: setup{ secrets: []*corev1.Secret{ { @@ -1696,7 +1697,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf("Skipping drain as machine is NotReady for over 5minutes. %s", machineutils.InitiateVMDeletion), + err: fmt.Errorf(fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach)), retry: machineutils.ShortRetry, machine: newMachine( &v1alpha1.MachineTemplateSpec{ @@ -1715,7 +1716,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Skipping drain as machine is NotReady for over 5minutes. %s", machineutils.InitiateVMDeletion), + Description: fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), @@ -1733,7 +1734,7 @@ var _ = Describe("machine", func() { ), }, }), - Entry("Drain skipping as machine is in ReadonlyFilesystem for a long time (5 minutes)", &data{ + Entry("Force Drain as machine is in ReadonlyFilesystem for a long time (5 minutes)", &data{ setup: setup{ secrets: []*corev1.Secret{ { @@ -1798,7 +1799,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf("Skipping drain as machine is in ReadonlyFilesystem for over 5minutes. %s", machineutils.InitiateVMDeletion), + err: fmt.Errorf(fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach)), retry: machineutils.ShortRetry, machine: newMachine( &v1alpha1.MachineTemplateSpec{ @@ -1817,7 +1818,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Skipping drain as machine is in ReadonlyFilesystem for over 5minutes. 
%s", machineutils.InitiateVMDeletion), + Description: fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), @@ -1835,7 +1836,7 @@ var _ = Describe("machine", func() { ), }, }), - Entry("Drain skipping as machine is NotReady for a long time(5 min) ,also ReadonlyFilesystem is true for a long time (5 minutes)", &data{ + Entry("Force Drain as machine is NotReady for a long time(5 min) ,also ReadonlyFilesystem is true for a long time (5 minutes)", &data{ setup: setup{ secrets: []*corev1.Secret{ { @@ -1905,7 +1906,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf("Skipping drain as machine is NotReady for over 5minutes. %s", machineutils.InitiateVMDeletion), + err: fmt.Errorf(fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach)), retry: machineutils.ShortRetry, machine: newMachine( &v1alpha1.MachineTemplateSpec{ @@ -1924,7 +1925,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Skipping drain as machine is NotReady for over 5minutes. %s", machineutils.InitiateVMDeletion), + Description: fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), @@ -2236,7 +2237,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Drain failed due to - Failed to update node. However, since it's a force deletion shall continue deletion of VM. %s", machineutils.InitiateVMDeletion), + Description: fmt.Sprintf("Drain failed due to - Failed to update node. However, since it's a force deletion shall continue deletion of VM. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), @@ -2450,7 +2451,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Drain failed due to - Failed to update node. However, since it's a force deletion shall continue deletion of VM. %s", machineutils.InitiateVMDeletion), + Description: fmt.Sprintf("Drain failed due to - Failed to update node. However, since it's a force deletion shall continue deletion of VM. 
%s", machineutils.DelVolumesAttachmentsAndWaitForDetach), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 0fb700da7..402588701 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -45,6 +45,7 @@ import ( utiltime "github.com/gardener/machine-controller-manager/pkg/util/time" v1 "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" apiequality "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -52,6 +53,8 @@ import ( "k8s.io/apimachinery/pkg/selection" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" + storageclient "k8s.io/client-go/kubernetes/typed/storage/v1" + storagelisters "k8s.io/client-go/listers/storage/v1" "k8s.io/klog/v2" ) @@ -526,6 +529,11 @@ func (c *controller) machineStatusUpdate( return nil } + if lastOperation.LastStateTransitionTime.IsZero() { + // preserve the last op state transition time if un-specified + lastOperation.LastStateTransitionTime = machine.Status.LastOperation.LastStateTransitionTime + } + _, err := c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(ctx, clone, metav1.UpdateOptions{}) if err != nil { // Keep retrying until update goes through @@ -1009,7 +1017,6 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver printLogInitError(message, &err, &description, machine) skipDrain = true } else { - for _, condition := range machine.Status.Conditions { if condition.Type == v1.NodeReady { nodeReadyCondition = condition @@ -1018,16 +1025,17 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver } } + klog.V(3).Infof("(drainNode) HOWDY. nodeReadyCondition: %s, readOnlyFileSystemCondition: %s", nodeReadyCondition, readOnlyFileSystemCondition) if !isConditionEmpty(nodeReadyCondition) && (nodeReadyCondition.Status != v1.ConditionTrue) && (time.Since(nodeReadyCondition.LastTransitionTime.Time) > nodeNotReadyDuration) { - // If node is in NotReady state over 5 minutes then skip the drain - message := "Skipping drain as machine is NotReady for over 5minutes." + message := "Setting forceDeletePods & forceDeleteMachine to true for drain as machine is NotReady for over 5min" + forceDeleteMachine = true + forceDeletePods = true printLogInitError(message, &err, &description, machine) - skipDrain = true } else if !isConditionEmpty(readOnlyFileSystemCondition) && (readOnlyFileSystemCondition.Status != v1.ConditionFalse) && (time.Since(readOnlyFileSystemCondition.LastTransitionTime.Time) > nodeNotReadyDuration) { - // If node is set to ReadonlyFilesystem over 5 minutes then skip the drain - message := "Skipping drain as machine is in ReadonlyFilesystem for over 5minutes." 
+ message := "Setting forceDeletePods & forceDeleteMachine to true for drain as machine is in ReadonlyFilesystem for over 5min" + forceDeleteMachine = true + forceDeletePods = true printLogInitError(message, &err, &description, machine) - skipDrain = true } } @@ -1106,21 +1114,27 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver c.nodeLister, c.volumeAttachmentHandler, ) + klog.V(3).Infof("(drainNode) Invoking RunDrain, forceDeleteMachine: %t, forceDeletePods: %t, timeOutDuration: %s", forceDeletePods, forceDeleteMachine, timeOutDuration) err = drainOptions.RunDrain(ctx) if err == nil { // Drain successful klog.V(2).Infof("Drain successful for machine %q ,providerID %q, backing node %q. \nBuf:%v \nErrBuf:%v", machine.Name, getProviderID(machine), getNodeName(machine), buf, errBuf) - description = fmt.Sprintf("Drain successful. %s", machineutils.InitiateVMDeletion) + if forceDeletePods { + description = fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach) + err = fmt.Errorf(description) + } else { // regular drain already waits for vol detach and attach for another node. + description = fmt.Sprintf("Drain successful. %s", machineutils.InitiateVMDeletion) + err = fmt.Errorf("Machine deletion in process. " + description) + } state = v1alpha1.MachineStateProcessing // Return error even when machine object is updated - err = fmt.Errorf("Machine deletion in process. " + description) } else if err != nil && forceDeleteMachine { // Drain failed on force deletion klog.Warningf("Drain failed for machine %q. However, since it's a force deletion shall continue deletion of VM. \nBuf:%v \nErrBuf:%v \nErr-Message:%v", machine.Name, buf, errBuf, err) - description = fmt.Sprintf("Drain failed due to - %s. However, since it's a force deletion shall continue deletion of VM. %s", err.Error(), machineutils.InitiateVMDeletion) + description = fmt.Sprintf("Drain failed due to - %s. However, since it's a force deletion shall continue deletion of VM. %s", err.Error(), machineutils.DelVolumesAttachmentsAndWaitForDetach) state = v1alpha1.MachineStateProcessing } else { klog.Warningf("Drain failed for machine %q , providerID %q ,backing node %q. \nBuf:%v \nErrBuf:%v \nErr-Message:%v", machine.Name, getProviderID(machine), getNodeName(machine), buf, errBuf, err) @@ -1131,14 +1145,16 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver } } + now := metav1.Now() c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ - Description: description, - State: state, - Type: v1alpha1.MachineOperationDelete, - LastUpdateTime: metav1.Now(), + Description: description, + State: state, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: now, + LastStateTransitionTime: now, }, // Let the clone.Status.CurrentStatus (LastUpdateTime) be as it was before. // This helps while computing when the drain timeout to determine if force deletion is to be triggered. @@ -1150,6 +1166,72 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver return machineutils.ShortRetry, err } +// deleteNodeVolAttachmentsAndWaitForDetach deletes VolumeAttachment(s) for a node and waits tillvolumes are detached from the node or detach timeout exceeded +// before moving to VM deletion stage. 
+func (c *controller) deleteNodeVolAttachmentsAndWaitForDetach(ctx context.Context, deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.RetryPeriod, error) { + var ( + description string + state v1alpha1.MachineState + machine = deleteMachineRequest.Machine + lastOp = machine.Status.LastOperation + detachTimeout = c.safetyOptions.PvDetachTimeout.Duration + nodeName = machine.Labels[v1alpha1.NodeLabelKey] + retryPeriod = machineutils.ShortRetry + ) + node, err := c.nodeLister.Get(nodeName) + if err != nil { + if !apierrors.IsNotFound(err) { + // an error other than NotFound, let us try again later. + return retryPeriod, err + } + // node not found move to vm deletion + description = fmt.Sprintf("Skipping deleteNodeVolAttachmentsAndWaitForDetach due to - %s. Moving to VM Deletion. %s", err.Error(), machineutils.InitiateVMDeletion) + state = v1alpha1.MachineStateProcessing + retryPeriod = 0 + } else if len(node.Status.VolumesAttached) == 0 { + description = fmt.Sprintf("Node Volumes for node: %s are already detached. Moving to VM Deletion. %s", nodeName, machineutils.InitiateVMDeletion) + state = v1alpha1.MachineStateProcessing + retryPeriod = 0 + } else { + liveNodeVolAttachments, err := getLiveVolumeAttachmentsForNode(c.volumeAttachementLister, nodeName) + if err != nil { + klog.Errorf("(deleteNodeVolAttachmentsAndWaitForDetach) Error obtaining VolumeAttachment(s) for node %q: ", nodeName, err) + return retryPeriod, err + } + lastOpStateTransitionTime := lastOp.LastStateTransitionTime.Time + if len(liveNodeVolAttachments) == 0 { + waitedTime := metav1.Now().Sub(lastOpStateTransitionTime) + if waitedTime < detachTimeout { + klog.V(3).Infof("(deleteNodeVolAttachmentsAndWaitForDetach) For node %q, #node.Status.VolumesAttached=%d, lastOpStateTransitionTime: %s, wait more detachTimeout: %s not expired", + nodeName, len(node.Status.VolumesAttached), lastOpStateTransitionTime, detachTimeout) + return retryPeriod, nil + } + description = fmt.Sprintf("(deleteNodeVolAttachmentsAndWaitForDetach) Timeout: %s expired. Moving to VM Deletion. 
%s", detachTimeout, machineutils.InitiateVMDeletion) + state = v1alpha1.MachineStateProcessing + retryPeriod = 0 + } else { + err = deleteVolumeAttachmentsForNode(ctx, c.targetCoreClient.StorageV1().VolumeAttachments(), nodeName, liveNodeVolAttachments) + return retryPeriod, nil + } + } + now := metav1.Now() + klog.V(4).Infof("(deleteVolumeAttachmentsForNode) For node %q, set LastOperation.Description: %q", nodeName, description) + err = c.machineStatusUpdate( + ctx, + machine, + v1alpha1.LastOperation{ + Description: description, + State: state, + Type: machine.Status.LastOperation.Type, + LastUpdateTime: now, + LastStateTransitionTime: now, + }, + machine.Status.CurrentStatus, + machine.Status.LastKnownState, + ) + return retryPeriod, err +} + // deleteVM attempts to delete the VM backed by the machine object func (c *controller) deleteVM(ctx context.Context, deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.RetryPeriod, error) { var ( @@ -1508,6 +1590,35 @@ func (c *controller) tryMarkingMachineFailed(ctx context.Context, machine, clone return machineutils.ShortRetry, err } +func getLiveVolumeAttachmentsForNode(volAttachLister storagelisters.VolumeAttachmentLister, nodeName string) ([]*storagev1.VolumeAttachment, error) { + volAttachments, err := volAttachLister.List(labels.NewSelector()) + if err != nil { + return nil, fmt.Errorf("cant list volume attachments for node %q: %w", nodeName, err) + } + nodeVolAttachments := make([]*storagev1.VolumeAttachment, 0, len(volAttachments)) + for _, va := range volAttachments { + if va.Spec.NodeName == nodeName && va.ObjectMeta.DeletionTimestamp == nil { + nodeVolAttachments = append(nodeVolAttachments, va) + } + } + return nodeVolAttachments, nil +} + +func deleteVolumeAttachmentsForNode(ctx context.Context, attachIf storageclient.VolumeAttachmentInterface, nodeName string, volAttachments []*storagev1.VolumeAttachment) error { + klog.V(3).Infof("(deleteVolumeAttachmentsForNode) Deleting #%d VolumeAttachment(s) for node %q", len(volAttachments), nodeName) + var errs []error + var delOpts = metav1.DeleteOptions{} + for _, va := range volAttachments { + err := attachIf.Delete(ctx, va.Name, delOpts) + if err != nil { + klog.Errorf("(deleteVolumeAttachmentsForNode) Error deleting VolumeAttachment %q for node %q: %s", va.Name, nodeName, err) + errs = append(errs, err) + } + klog.V(4).Infof("(deleteVolumeAttachmentsForNode) Deleted VolumeAttachment %q for node %q", va.Name, nodeName) + } + return errors.Join(errs...) +} + func getProviderID(machine *v1alpha1.Machine) string { return machine.Spec.ProviderID } diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go index e9fe909f8..bb9bf4d38 100644 --- a/pkg/util/provider/machineutils/utils.go +++ b/pkg/util/provider/machineutils/utils.go @@ -30,6 +30,9 @@ const ( // InitiateDrain specifies next step as initiate node drain InitiateDrain = "Initiate node drain" + // DelVolumesAttachmentsAndWaitForDetach specifies next step as deleting volume attachments and waiting for their detachment. 
+ DelVolumesAttachmentsAndWaitForDetach = "Delete Volume Attachments and Wait For Detach" + // InitiateVMDeletion specifies next step as initiate VM deletion InitiateVMDeletion = "Initiate VM deletion" diff --git a/pkg/util/provider/metrics/metrics.go b/pkg/util/provider/metrics/metrics.go index 6e73e6724..50033312a 100644 --- a/pkg/util/provider/metrics/metrics.go +++ b/pkg/util/provider/metrics/metrics.go @@ -113,4 +113,4 @@ func init() { registerMachineSubsystemMetrics() registerCloudAPISubsystemMetrics() registerMiscellaneousMetrics() -} \ No newline at end of file +} From 9454fadc83388d6cb970594b1ff95ec89443dacb Mon Sep 17 00:00:00 2001 From: elankath Date: Tue, 22 Aug 2023 07:49:26 +0530 Subject: [PATCH 02/11] Removed lastStateTransitionTime - unneeded for fix --- docs/documents/apis.md | 15 ------ ...achine.sapcloud.io_machinedeployments.yaml | 5 -- .../crds/machine.sapcloud.io_machines.yaml | 5 -- .../crds/machine.sapcloud.io_machinesets.yaml | 10 ---- pkg/apis/machine/types.go | 3 -- pkg/apis/machine/v1alpha1/machine_types.go | 3 -- .../v1alpha1/zz_generated.conversion.go | 2 - .../machine/v1alpha1/zz_generated.deepcopy.go | 1 - pkg/apis/machine/zz_generated.deepcopy.go | 1 - pkg/openapi/openapi_generated.go | 7 --- .../provider/machinecontroller/machine.go | 2 +- .../machinecontroller/machine_util.go | 54 +++++++------------ 12 files changed, 20 insertions(+), 88 deletions(-) diff --git a/docs/documents/apis.md b/docs/documents/apis.md index 00ab4b681..393539385 100644 --- a/docs/documents/apis.md +++ b/docs/documents/apis.md @@ -914,21 +914,6 @@ MachineOperationType

Type of operation

- - -lastStateTransitionTime - - - - -Kubernetes meta/v1.Time - - - - -

LastStateTransitionTime represents the Last Operation State Transition Time.

- -
diff --git a/kubernetes/crds/machine.sapcloud.io_machinedeployments.yaml b/kubernetes/crds/machine.sapcloud.io_machinedeployments.yaml index f17f9961d..efa51054d 100644 --- a/kubernetes/crds/machine.sapcloud.io_machinedeployments.yaml +++ b/kubernetes/crds/machine.sapcloud.io_machinedeployments.yaml @@ -431,11 +431,6 @@ spec: description: description: Description of the current operation type: string - lastStateTransitionTime: - description: LastStateTransitionTime represents the Last - Operation State Transition Time. - format: date-time - type: string lastUpdateTime: description: Last update time of current operation format: date-time diff --git a/kubernetes/crds/machine.sapcloud.io_machines.yaml b/kubernetes/crds/machine.sapcloud.io_machines.yaml index 511c56e27..651b2b9cf 100644 --- a/kubernetes/crds/machine.sapcloud.io_machines.yaml +++ b/kubernetes/crds/machine.sapcloud.io_machines.yaml @@ -263,11 +263,6 @@ spec: description: description: Description of the current operation type: string - lastStateTransitionTime: - description: LastStateTransitionTime represents the Last Operation - State Transition Time. - format: date-time - type: string lastUpdateTime: description: Last update time of current operation format: date-time diff --git a/kubernetes/crds/machine.sapcloud.io_machinesets.yaml b/kubernetes/crds/machine.sapcloud.io_machinesets.yaml index 7387da025..4f362a048 100644 --- a/kubernetes/crds/machine.sapcloud.io_machinesets.yaml +++ b/kubernetes/crds/machine.sapcloud.io_machinesets.yaml @@ -313,11 +313,6 @@ spec: description: description: Description of the current operation type: string - lastStateTransitionTime: - description: LastStateTransitionTime represents the Last - Operation State Transition Time. - format: date-time - type: string lastUpdateTime: description: Last update time of current operation format: date-time @@ -352,11 +347,6 @@ spec: description: description: Description of the current operation type: string - lastStateTransitionTime: - description: LastStateTransitionTime represents the Last Operation - State Transition Time. - format: date-time - type: string lastUpdateTime: description: Last update time of current operation format: date-time diff --git a/pkg/apis/machine/types.go b/pkg/apis/machine/types.go index 71bcf9122..44a01719d 100644 --- a/pkg/apis/machine/types.go +++ b/pkg/apis/machine/types.go @@ -195,9 +195,6 @@ type LastOperation struct { // Type of operation Type MachineOperationType - - // LastStateTransitionTime represents the Last Operation State Transition Time. - LastStateTransitionTime metav1.Time } // MachinePhase is a label for the condition of a machines at the current time. diff --git a/pkg/apis/machine/v1alpha1/machine_types.go b/pkg/apis/machine/v1alpha1/machine_types.go index 1f601e9d3..7ddc6b8dc 100644 --- a/pkg/apis/machine/v1alpha1/machine_types.go +++ b/pkg/apis/machine/v1alpha1/machine_types.go @@ -130,9 +130,6 @@ type LastOperation struct { // Type of operation Type MachineOperationType `json:"type,omitempty"` - - // LastStateTransitionTime represents the Last Operation State Transition Time. - LastStateTransitionTime metav1.Time `json:"lastStateTransitionTime,omitempty"` } // MachinePhase is a label for the condition of a machines at the current time. 
diff --git a/pkg/apis/machine/v1alpha1/zz_generated.conversion.go b/pkg/apis/machine/v1alpha1/zz_generated.conversion.go index c5ab0bbc1..5b027828d 100644 --- a/pkg/apis/machine/v1alpha1/zz_generated.conversion.go +++ b/pkg/apis/machine/v1alpha1/zz_generated.conversion.go @@ -365,7 +365,6 @@ func autoConvert_v1alpha1_LastOperation_To_machine_LastOperation(in *LastOperati out.LastUpdateTime = in.LastUpdateTime out.State = machine.MachineState(in.State) out.Type = machine.MachineOperationType(in.Type) - out.LastStateTransitionTime = in.LastStateTransitionTime return nil } @@ -379,7 +378,6 @@ func autoConvert_machine_LastOperation_To_v1alpha1_LastOperation(in *machine.Las out.LastUpdateTime = in.LastUpdateTime out.State = MachineState(in.State) out.Type = MachineOperationType(in.Type) - out.LastStateTransitionTime = in.LastStateTransitionTime return nil } diff --git a/pkg/apis/machine/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/machine/v1alpha1/zz_generated.deepcopy.go index a11f3c542..35b368e1a 100644 --- a/pkg/apis/machine/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/machine/v1alpha1/zz_generated.deepcopy.go @@ -65,7 +65,6 @@ func (in *CurrentStatus) DeepCopy() *CurrentStatus { func (in *LastOperation) DeepCopyInto(out *LastOperation) { *out = *in in.LastUpdateTime.DeepCopyInto(&out.LastUpdateTime) - in.LastStateTransitionTime.DeepCopyInto(&out.LastStateTransitionTime) return } diff --git a/pkg/apis/machine/zz_generated.deepcopy.go b/pkg/apis/machine/zz_generated.deepcopy.go index fcd9752cb..e9bb94643 100644 --- a/pkg/apis/machine/zz_generated.deepcopy.go +++ b/pkg/apis/machine/zz_generated.deepcopy.go @@ -65,7 +65,6 @@ func (in *CurrentStatus) DeepCopy() *CurrentStatus { func (in *LastOperation) DeepCopyInto(out *LastOperation) { *out = *in in.LastUpdateTime.DeepCopyInto(&out.LastUpdateTime) - in.LastStateTransitionTime.DeepCopyInto(&out.LastStateTransitionTime) return } diff --git a/pkg/openapi/openapi_generated.go b/pkg/openapi/openapi_generated.go index 501227f8a..0667336c2 100644 --- a/pkg/openapi/openapi_generated.go +++ b/pkg/openapi/openapi_generated.go @@ -434,13 +434,6 @@ func schema_pkg_apis_machine_v1alpha1_LastOperation(ref common.ReferenceCallback Format: "", }, }, - "lastStateTransitionTime": { - SchemaProps: spec.SchemaProps{ - Description: "LastStateTransitionTime represents the Last Operation State Transition Time.", - Default: map[string]interface{}{}, - Ref: ref("k8s.io/apimachinery/pkg/apis/meta/v1.Time"), - }, - }, }, }, }, diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 52703f739..684e7a61f 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -594,7 +594,7 @@ func (c *controller) triggerDeletionFlow(ctx context.Context, deleteMachineReque return c.drainNode(ctx, deleteMachineRequest) case strings.Contains(machine.Status.LastOperation.Description, machineutils.DelVolumesAttachmentsAndWaitForDetach): - return c.deleteNodeVolAttachmentsAndWaitForDetach(ctx, deleteMachineRequest) + return c.deleteNodeVolAttachments(ctx, deleteMachineRequest) case strings.Contains(machine.Status.LastOperation.Description, machineutils.InitiateVMDeletion): return c.deleteVM(ctx, deleteMachineRequest) diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 402588701..e26bcd8a5 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ 
b/pkg/util/provider/machinecontroller/machine_util.go @@ -529,11 +529,6 @@ func (c *controller) machineStatusUpdate( return nil } - if lastOperation.LastStateTransitionTime.IsZero() { - // preserve the last op state transition time if un-specified - lastOperation.LastStateTransitionTime = machine.Status.LastOperation.LastStateTransitionTime - } - _, err := c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(ctx, clone, metav1.UpdateOptions{}) if err != nil { // Keep retrying until update goes through @@ -1150,11 +1145,10 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver ctx, machine, v1alpha1.LastOperation{ - Description: description, - State: state, - Type: v1alpha1.MachineOperationDelete, - LastUpdateTime: now, - LastStateTransitionTime: now, + Description: description, + State: state, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: now, }, // Let the clone.Status.CurrentStatus (LastUpdateTime) be as it was before. // This helps while computing when the drain timeout to determine if force deletion is to be triggered. @@ -1166,17 +1160,15 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver return machineutils.ShortRetry, err } -// deleteNodeVolAttachmentsAndWaitForDetach deletes VolumeAttachment(s) for a node and waits tillvolumes are detached from the node or detach timeout exceeded +// deleteNodeVolAttachments deletes VolumeAttachment(s) for a node and waits tillvolumes are detached from the node or detach timeout exceeded // before moving to VM deletion stage. -func (c *controller) deleteNodeVolAttachmentsAndWaitForDetach(ctx context.Context, deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.RetryPeriod, error) { +func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.RetryPeriod, error) { var ( - description string - state v1alpha1.MachineState - machine = deleteMachineRequest.Machine - lastOp = machine.Status.LastOperation - detachTimeout = c.safetyOptions.PvDetachTimeout.Duration - nodeName = machine.Labels[v1alpha1.NodeLabelKey] - retryPeriod = machineutils.ShortRetry + description string + state v1alpha1.MachineState + machine = deleteMachineRequest.Machine + nodeName = machine.Labels[v1alpha1.NodeLabelKey] + retryPeriod = machineutils.ShortRetry ) node, err := c.nodeLister.Get(nodeName) if err != nil { @@ -1185,7 +1177,7 @@ func (c *controller) deleteNodeVolAttachmentsAndWaitForDetach(ctx context.Contex return retryPeriod, err } // node not found move to vm deletion - description = fmt.Sprintf("Skipping deleteNodeVolAttachmentsAndWaitForDetach due to - %s. Moving to VM Deletion. %s", err.Error(), machineutils.InitiateVMDeletion) + description = fmt.Sprintf("Skipping deleteNodeVolAttachments due to - %s. Moving to VM Deletion. 
%s", err.Error(), machineutils.InitiateVMDeletion) state = v1alpha1.MachineStateProcessing retryPeriod = 0 } else if len(node.Status.VolumesAttached) == 0 { @@ -1193,22 +1185,15 @@ func (c *controller) deleteNodeVolAttachmentsAndWaitForDetach(ctx context.Contex state = v1alpha1.MachineStateProcessing retryPeriod = 0 } else { + // case: where node.Status.VolumesAttached > 0 liveNodeVolAttachments, err := getLiveVolumeAttachmentsForNode(c.volumeAttachementLister, nodeName) if err != nil { - klog.Errorf("(deleteNodeVolAttachmentsAndWaitForDetach) Error obtaining VolumeAttachment(s) for node %q: ", nodeName, err) + klog.Errorf("(deleteNodeVolAttachments) Error obtaining VolumeAttachment(s) for node %q: ", nodeName, err) return retryPeriod, err } - lastOpStateTransitionTime := lastOp.LastStateTransitionTime.Time if len(liveNodeVolAttachments) == 0 { - waitedTime := metav1.Now().Sub(lastOpStateTransitionTime) - if waitedTime < detachTimeout { - klog.V(3).Infof("(deleteNodeVolAttachmentsAndWaitForDetach) For node %q, #node.Status.VolumesAttached=%d, lastOpStateTransitionTime: %s, wait more detachTimeout: %s not expired", - nodeName, len(node.Status.VolumesAttached), lastOpStateTransitionTime, detachTimeout) - return retryPeriod, nil - } - description = fmt.Sprintf("(deleteNodeVolAttachmentsAndWaitForDetach) Timeout: %s expired. Moving to VM Deletion. %s", detachTimeout, machineutils.InitiateVMDeletion) + description = fmt.Sprintf("No Live VolumeAttachments for node: %s. Moving to VM Deletion. %s", nodeName, machineutils.InitiateVMDeletion) state = v1alpha1.MachineStateProcessing - retryPeriod = 0 } else { err = deleteVolumeAttachmentsForNode(ctx, c.targetCoreClient.StorageV1().VolumeAttachments(), nodeName, liveNodeVolAttachments) return retryPeriod, nil @@ -1220,11 +1205,10 @@ func (c *controller) deleteNodeVolAttachmentsAndWaitForDetach(ctx context.Contex ctx, machine, v1alpha1.LastOperation{ - Description: description, - State: state, - Type: machine.Status.LastOperation.Type, - LastUpdateTime: now, - LastStateTransitionTime: now, + Description: description, + State: state, + Type: machine.Status.LastOperation.Type, + LastUpdateTime: now, }, machine.Status.CurrentStatus, machine.Status.LastKnownState, From 746e3ac4662a35bd83404664ba925b0df78d0138 Mon Sep 17 00:00:00 2001 From: elankath Date: Tue, 12 Sep 2023 11:54:34 +0530 Subject: [PATCH 03/11] removed debug message --- pkg/util/provider/machinecontroller/machine_util.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index e26bcd8a5..a86c7b0e6 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -1020,7 +1020,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver } } - klog.V(3).Infof("(drainNode) HOWDY. 
nodeReadyCondition: %s, readOnlyFileSystemCondition: %s", nodeReadyCondition, readOnlyFileSystemCondition) + klog.V(3).Infof("(drainNode) nodeReadyCondition: %s, readOnlyFileSystemCondition: %s", nodeReadyCondition, readOnlyFileSystemCondition) if !isConditionEmpty(nodeReadyCondition) && (nodeReadyCondition.Status != v1.ConditionTrue) && (time.Since(nodeReadyCondition.LastTransitionTime.Time) > nodeNotReadyDuration) { message := "Setting forceDeletePods & forceDeleteMachine to true for drain as machine is NotReady for over 5min" forceDeleteMachine = true From 581780f8b6a288c2d9f6beec25c3d2d34741465b Mon Sep 17 00:00:00 2001 From: elankath Date: Tue, 12 Sep 2023 11:57:30 +0530 Subject: [PATCH 04/11] corrected stage name --- pkg/util/provider/machinecontroller/machine.go | 2 +- .../provider/machinecontroller/machine_test.go | 16 ++++++++-------- .../provider/machinecontroller/machine_util.go | 4 ++-- pkg/util/provider/machineutils/utils.go | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 684e7a61f..dbd95b480 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -593,7 +593,7 @@ func (c *controller) triggerDeletionFlow(ctx context.Context, deleteMachineReque case strings.Contains(machine.Status.LastOperation.Description, machineutils.InitiateDrain): return c.drainNode(ctx, deleteMachineRequest) - case strings.Contains(machine.Status.LastOperation.Description, machineutils.DelVolumesAttachmentsAndWaitForDetach): + case strings.Contains(machine.Status.LastOperation.Description, machineutils.DelVolumesAttachments): return c.deleteNodeVolAttachments(ctx, deleteMachineRequest) case strings.Contains(machine.Status.LastOperation.Description, machineutils.InitiateVMDeletion): diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index c8f70e511..ee9bbe447 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -1697,7 +1697,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf(fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach)), + err: fmt.Errorf(fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachments)), retry: machineutils.ShortRetry, machine: newMachine( &v1alpha1.MachineTemplateSpec{ @@ -1716,7 +1716,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach), + Description: fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachments), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), @@ -1799,7 +1799,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf(fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach)), + err: fmt.Errorf(fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachments)), retry: machineutils.ShortRetry, machine: newMachine( &v1alpha1.MachineTemplateSpec{ @@ -1818,7 +1818,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Drain successful. 
%s", machineutils.DelVolumesAttachmentsAndWaitForDetach), + Description: fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachments), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), @@ -1906,7 +1906,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf(fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach)), + err: fmt.Errorf(fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachments)), retry: machineutils.ShortRetry, machine: newMachine( &v1alpha1.MachineTemplateSpec{ @@ -1925,7 +1925,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach), + Description: fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachments), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), @@ -2237,7 +2237,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Drain failed due to - Failed to update node. However, since it's a force deletion shall continue deletion of VM. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach), + Description: fmt.Sprintf("Drain failed due to - Failed to update node. However, since it's a force deletion shall continue deletion of VM. %s", machineutils.DelVolumesAttachments), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), @@ -2451,7 +2451,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Drain failed due to - Failed to update node. However, since it's a force deletion shall continue deletion of VM. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach), + Description: fmt.Sprintf("Drain failed due to - Failed to update node. However, since it's a force deletion shall continue deletion of VM. %s", machineutils.DelVolumesAttachments), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index a86c7b0e6..65700a90a 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -1116,7 +1116,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver klog.V(2).Infof("Drain successful for machine %q ,providerID %q, backing node %q. \nBuf:%v \nErrBuf:%v", machine.Name, getProviderID(machine), getNodeName(machine), buf, errBuf) if forceDeletePods { - description = fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachmentsAndWaitForDetach) + description = fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachments) err = fmt.Errorf(description) } else { // regular drain already waits for vol detach and attach for another node. description = fmt.Sprintf("Drain successful. %s", machineutils.InitiateVMDeletion) @@ -1129,7 +1129,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver // Drain failed on force deletion klog.Warningf("Drain failed for machine %q. However, since it's a force deletion shall continue deletion of VM. 
\nBuf:%v \nErrBuf:%v \nErr-Message:%v", machine.Name, buf, errBuf, err) - description = fmt.Sprintf("Drain failed due to - %s. However, since it's a force deletion shall continue deletion of VM. %s", err.Error(), machineutils.DelVolumesAttachmentsAndWaitForDetach) + description = fmt.Sprintf("Drain failed due to - %s. However, since it's a force deletion shall continue deletion of VM. %s", err.Error(), machineutils.DelVolumesAttachments) state = v1alpha1.MachineStateProcessing } else { klog.Warningf("Drain failed for machine %q , providerID %q ,backing node %q. \nBuf:%v \nErrBuf:%v \nErr-Message:%v", machine.Name, getProviderID(machine), getNodeName(machine), buf, errBuf, err) diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go index bb9bf4d38..55914bb61 100644 --- a/pkg/util/provider/machineutils/utils.go +++ b/pkg/util/provider/machineutils/utils.go @@ -30,8 +30,8 @@ const ( // InitiateDrain specifies next step as initiate node drain InitiateDrain = "Initiate node drain" - // DelVolumesAttachmentsAndWaitForDetach specifies next step as deleting volume attachments and waiting for their detachment. - DelVolumesAttachmentsAndWaitForDetach = "Delete Volume Attachments and Wait For Detach" + // DelVolumesAttachments specifies next step as deleting volume attachments + DelVolumesAttachments = "Delete Volume Attachments and Wait For Detach" // InitiateVMDeletion specifies next step as initiate VM deletion InitiateVMDeletion = "Initiate VM deletion" From 98e21eb44421c1c8ed6828e3fa7b6c25a12e867f Mon Sep 17 00:00:00 2001 From: elankath Date: Tue, 12 Sep 2023 12:01:21 +0530 Subject: [PATCH 05/11] reverted trivial changes to drain --- pkg/util/provider/drain/drain.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pkg/util/provider/drain/drain.go b/pkg/util/provider/drain/drain.go index db3622545..0b5a9f6e4 100644 --- a/pkg/util/provider/drain/drain.go +++ b/pkg/util/provider/drain/drain.go @@ -33,6 +33,8 @@ import ( "time" "github.com/Masterminds/semver" + "github.com/gardener/machine-controller-manager/pkg/util/k8sutils" + "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" corev1 "k8s.io/api/core/v1" policyv1 "k8s.io/api/policy/v1" policyv1beta1 "k8s.io/api/policy/v1beta1" @@ -47,9 +49,6 @@ import ( policyv1listers "k8s.io/client-go/listers/policy/v1" policyv1beta1listers "k8s.io/client-go/listers/policy/v1beta1" "k8s.io/klog/v2" - - "github.com/gardener/machine-controller-manager/pkg/util/k8sutils" - "github.com/gardener/machine-controller-manager/pkg/util/provider/driver" ) // Options are configurable options while draining a node before deletion @@ -212,9 +211,7 @@ func (o *Options) RunDrain(ctx context.Context) error { } err := o.deleteOrEvictPodsSimple(ctx) - return err - } func (o *Options) deleteOrEvictPodsSimple(ctx context.Context) error { From bed996726bda86aa27014af26197fe2945d9ba98 Mon Sep 17 00:00:00 2001 From: elankath Date: Thu, 21 Sep 2023 12:08:10 +0530 Subject: [PATCH 06/11] Addressed review comments and documentation --- docs/FAQ.md | 19 +++++++++++++++++++ .../machinecontroller/machine_util.go | 12 +++++++----- pkg/util/provider/machineutils/utils.go | 2 +- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index 9ede8f41f..f9284778b 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -37,6 +37,7 @@ this document. 
Few of the answers assume that the MCM being used is in conjuctio * [What health checks are performed on a machine?](#what-health-checks-are-performed-on-a-machine) * [How does rate limiting replacement of machine work in MCM ? How is it related to meltdown protection?](#how-does-rate-limiting-replacement-of-machine-work-in-mcm-how-is-it-related-to-meltdown-protection) * [How MCM responds when scale-out/scale-in is done during rolling update of a machinedeployment?](#how-mcm-responds-when-scale-outscale-in-is-done-during-rolling-update-of-a-machinedeployment) + * [How does MCM handle an unhealthy node?](#how-does-mcm-handle-an-unhealthy-node) * [Troubleshooting](#troubleshooting) * [My machine is stuck in deletion for 1 hr, why?](#My-machine-is-stuck-in-deletion-for-1-hr-why) @@ -256,6 +257,24 @@ During update for scaling event, a machineSet is updated if any of the below is Once scaling is achieved, rollout continues. +## How does MCM handle an unhealthy node ? + +A node is considered unhealthy if the `Ready` condition is `False` or one of the +supplementary: `KernelDeadlock`, `ReadonlyFilesystem`, `DiskPressure` , `NetworkUnavailable` is `True` + +As soon as a node is detected as unhealthy, the controller health-check moves the machine phase to `Unknown`. If the +node is unhealthy for more than the `machine-health-timeout` specified for the `machine-controller`, the controller +health-check moves the machine phase to `Failed`. By default, the `machine-health-timeout` is 10` minutes. + +`Failed` machines have their deletion timestamp set and the machine then moves to the `Terminating` phase. The node +drain process is initiated. If the machine has not been `Ready` for greater than `5` minutes then the node is forcefully +drained. Other-wise the node is gracefully drained. This is followed by the deletion of the cloud provider VM associated +with the `Machine` and then finally ending with the `Node` object deletion. + + + + + # Troubleshooting ### My machine is stuck in deletion for 1 hr, why? diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 65700a90a..aa9e1c2cf 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -1140,7 +1140,6 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver } } - now := metav1.Now() c.machineStatusUpdate( ctx, machine, @@ -1148,7 +1147,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver Description: description, State: state, Type: v1alpha1.MachineOperationDelete, - LastUpdateTime: now, + LastUpdateTime: metav1.Now(), }, // Let the clone.Status.CurrentStatus (LastUpdateTime) be as it was before. // This helps while computing when the drain timeout to determine if force deletion is to be triggered. @@ -1160,8 +1159,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver return machineutils.ShortRetry, err } -// deleteNodeVolAttachments deletes VolumeAttachment(s) for a node and waits tillvolumes are detached from the node or detach timeout exceeded -// before moving to VM deletion stage. +// deleteNodeVolAttachments deletes VolumeAttachment(s) for a node before moving to VM deletion stage. 
func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.RetryPeriod, error) { var ( description string @@ -1196,6 +1194,11 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine state = v1alpha1.MachineStateProcessing } else { err = deleteVolumeAttachmentsForNode(ctx, c.targetCoreClient.StorageV1().VolumeAttachments(), nodeName, liveNodeVolAttachments) + if err != nil { + klog.Errorf("(deleteNodeVolAttachments) Error deleting volume attachments for node %q: %s", nodeName, err) + } else { + klog.V(3).Infof("(deleteNodeVolAttachments) Successfully deleted all volume attachments for node %q", nodeName) + } return retryPeriod, nil } } @@ -1595,7 +1598,6 @@ func deleteVolumeAttachmentsForNode(ctx context.Context, attachIf storageclient. for _, va := range volAttachments { err := attachIf.Delete(ctx, va.Name, delOpts) if err != nil { - klog.Errorf("(deleteVolumeAttachmentsForNode) Error deleting VolumeAttachment %q for node %q: %s", va.Name, nodeName, err) errs = append(errs, err) } klog.V(4).Infof("(deleteVolumeAttachmentsForNode) Deleted VolumeAttachment %q for node %q", va.Name, nodeName) diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go index 55914bb61..8bc546a0f 100644 --- a/pkg/util/provider/machineutils/utils.go +++ b/pkg/util/provider/machineutils/utils.go @@ -31,7 +31,7 @@ const ( InitiateDrain = "Initiate node drain" // DelVolumesAttachments specifies next step as deleting volume attachments - DelVolumesAttachments = "Delete Volume Attachments and Wait For Detach" + DelVolumesAttachments = "Delete Volume Attachments" // InitiateVMDeletion specifies next step as initiate VM deletion InitiateVMDeletion = "Initiate VM deletion" From 51cd01c7a5321edf9487c7f9f9f83b8739a98c17 Mon Sep 17 00:00:00 2001 From: elankath Date: Thu, 21 Sep 2023 12:45:36 +0530 Subject: [PATCH 07/11] fixed anchor link for new q in faq --- docs/FAQ.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index f9284778b..30ad30923 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -37,7 +37,7 @@ this document. Few of the answers assume that the MCM being used is in conjuctio * [What health checks are performed on a machine?](#what-health-checks-are-performed-on-a-machine) * [How does rate limiting replacement of machine work in MCM ? How is it related to meltdown protection?](#how-does-rate-limiting-replacement-of-machine-work-in-mcm-how-is-it-related-to-meltdown-protection) * [How MCM responds when scale-out/scale-in is done during rolling update of a machinedeployment?](#how-mcm-responds-when-scale-outscale-in-is-done-during-rolling-update-of-a-machinedeployment) - * [How does MCM handle an unhealthy node?](#how-does-mcm-handle-an-unhealthy-node) + * [How does MCM handle an unhealthy node?](#how-does-mcm-handle-an-unhealthy-node-) * [Troubleshooting](#troubleshooting) * [My machine is stuck in deletion for 1 hr, why?](#My-machine-is-stuck-in-deletion-for-1-hr-why) From 3d34d1f52304bb4070df4c989b70e1f47f3fc859 Mon Sep 17 00:00:00 2001 From: elankath Date: Thu, 21 Sep 2023 13:44:21 +0530 Subject: [PATCH 08/11] changed FAQ q to explain forceful deletion of unhealthy nodes --- docs/FAQ.md | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index 30ad30923..693048147 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -37,7 +37,7 @@ this document. 
Few of the answers assume that the MCM being used is in conjuctio
 * [What health checks are performed on a machine?](#what-health-checks-are-performed-on-a-machine)
 * [How does rate limiting replacement of machine work in MCM ? How is it related to meltdown protection?](#how-does-rate-limiting-replacement-of-machine-work-in-mcm-how-is-it-related-to-meltdown-protection)
 * [How MCM responds when scale-out/scale-in is done during rolling update of a machinedeployment?](#how-mcm-responds-when-scale-outscale-in-is-done-during-rolling-update-of-a-machinedeployment)
- * [How does MCM handle an unhealthy node?](#how-does-mcm-handle-an-unhealthy-node-)
+ * [How are some unhealthy machines drained quickly?](#how-are-some-unhealthy-machines-drained-quickly-)
 * [Troubleshooting](#troubleshooting)
 * [My machine is stuck in deletion for 1 hr, why?](#My-machine-is-stuck-in-deletion-for-1-hr-why)
@@ -257,19 +257,23 @@ Once scaling is achieved, rollout continues.
 
-## How does MCM handle an unhealthy node ?
+## How are some unhealthy machines drained quickly ?
 
-A node is considered unhealthy if the `Ready` condition is `False` or one of the supplementary
-conditions `KernelDeadlock`, `ReadonlyFilesystem`, `DiskPressure`, or `NetworkUnavailable` is `True`.
-
-As soon as a node is detected as unhealthy, the controller health-check moves the machine phase to `Unknown`. If the
-node is unhealthy for more than the `machine-health-timeout` specified for the `machine-controller`, the controller
+If a node is unhealthy for more than the `machine-health-timeout` specified for the `machine-controller`, the controller
 health-check moves the machine phase to `Failed`. By default, the `machine-health-timeout` is `10` minutes.
 
 `Failed` machines have their deletion timestamp set and the machine then moves to the `Terminating` phase. The node
-drain process is initiated. If the machine has not been `Ready` for greater than `5` minutes, then the node is forcefully
-drained. Otherwise, the node is gracefully drained. This is followed by the deletion of the cloud provider VM associated
-with the `Machine` and finally the deletion of the `Node` object.
+drain process is initiated. The drain process is invoked either *gracefully* or *forcefully*.
+
+The usual drain process is graceful. Pods are evicted from the node, and the drain process waits until any existing
+attached volumes are mounted on a new node. However, if the node's `Ready` condition is `False` or its `ReadonlyFilesystem`
+condition is `True` for greater than `5` minutes, then a forceful drain is initiated. In a forceful drain, pods are deleted
+and `VolumeAttachment` objects associated with the old node are also deleted. This is followed by the deletion of the
+cloud provider VM associated with the `Machine` and finally the deletion of the `Node` object.
+
+During the deletion of the VM, we only delete the local data and boot disks associated with the VM. The disks associated
+with persistent volumes are left untouched, as their attach/detach and mount/unmount processes are handled by the k8s
+attach-detach controller in conjunction with the CSI driver.
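
The timing decision described in the FAQ text above can be illustrated with a short sketch. This is a minimal, hypothetical Go example (the package name and the `shouldForceDrain`/`getCondition` helpers are invented for illustration); it mirrors the `nodeReadyCondition`/`readOnlyFileSystemCondition` checks that `drainNode` performs in `machine_util.go`, but it is not the controller's actual code:

```go
package drainsketch

import (
	"time"

	corev1 "k8s.io/api/core/v1"
)

// nodeNotReadyDuration mirrors the non-configurable 5-minute threshold described in the FAQ.
const nodeNotReadyDuration = 5 * time.Minute

// getCondition returns the node condition of the given type, or nil if it is absent.
func getCondition(node *corev1.Node, condType corev1.NodeConditionType) *corev1.NodeCondition {
	for i := range node.Status.Conditions {
		if node.Status.Conditions[i].Type == condType {
			return &node.Status.Conditions[i]
		}
	}
	return nil
}

// shouldForceDrain reports whether the node has been NotReady, or has had a
// read-only filesystem, for longer than nodeNotReadyDuration.
func shouldForceDrain(node *corev1.Node) bool {
	if ready := getCondition(node, corev1.NodeReady); ready != nil &&
		ready.Status != corev1.ConditionTrue &&
		time.Since(ready.LastTransitionTime.Time) > nodeNotReadyDuration {
		return true
	}
	if rofs := getCondition(node, corev1.NodeConditionType("ReadonlyFilesystem")); rofs != nil &&
		rofs.Status == corev1.ConditionTrue &&
		time.Since(rofs.LastTransitionTime.Time) > nodeNotReadyDuration {
		return true
	}
	return false
}
```

In the real controller, this decision additionally sets `forceDeleteMachine` and `forceDeletePods`, which is what routes deletion through the `VolumeAttachment` cleanup step instead of the usual wait-for-detach path.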

From bcb0d0eb80c425a78491e22f5ec188642b4c8c18 Mon Sep 17 00:00:00 2001
From: elankath
Date: Thu, 21 Sep 2023 13:55:29 +0530
Subject: [PATCH 09/11] removed excessive whitespace in faq

---
 docs/FAQ.md | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/docs/FAQ.md b/docs/FAQ.md
index 693048147..78672642d 100644
--- a/docs/FAQ.md
+++ b/docs/FAQ.md
@@ -275,10 +275,6 @@ During the deletion of the VM, we only delete the local data and boot disks assoc
 with persistent volumes are left untouched, as their attach/detach and mount/unmount processes are handled by the k8s
 attach-detach controller in conjunction with the CSI driver.
 
-
-
-
-
 # Troubleshooting
 
 ### My machine is stuck in deletion for 1 hr, why?

From b07a41f539fad89840567e907481360f3407e8a3 Mon Sep 17 00:00:00 2001
From: elankath
Date: Fri, 22 Sep 2023 12:57:28 +0530
Subject: [PATCH 10/11] del of vol attachments log machine name, minor faq
 update

---
 docs/FAQ.md                                   |  6 +++---
 .../machinecontroller/machine_util.go         | 21 +++++++++----------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/docs/FAQ.md b/docs/FAQ.md
index 78672642d..471f9acb1 100644
--- a/docs/FAQ.md
+++ b/docs/FAQ.md
@@ -267,11 +267,11 @@ drain process is initiated. The drain process is invoked either *gracefully* or
 
 The usual drain process is graceful. Pods are evicted from the node, and the drain process waits until any existing
 attached volumes are mounted on a new node. However, if the node's `Ready` condition is `False` or its `ReadonlyFilesystem`
-condition is `True` for greater than `5` minutes, then a forceful drain is initiated. In a forceful drain, pods are deleted
-and `VolumeAttachment` objects associated with the old node are also deleted. This is followed by the deletion of the
+condition is `True` for greater than `5` minutes (non-configurable), then a forceful drain is initiated. In a forceful drain, pods are deleted
+and `VolumeAttachment` objects associated with the old node are also marked for deletion. This is followed by the deletion of the
 cloud provider VM associated with the `Machine` and finally the deletion of the `Node` object.
 
-During the deletion of the VM, we only delete the local data and boot disks associated with the VM. The disks associated
+During the deletion of the VM, we only delete the local data disks and boot disks associated with the VM. The disks associated
 with persistent volumes are left untouched, as their attach/detach and mount/unmount processes are handled by the k8s
 attach-detach controller in conjunction with the CSI driver.
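
For reference, the `VolumeAttachment` deletion that a forceful drain performs can be sketched as follows. This is a simplified, hypothetical example (the function name and the flat list-then-filter approach are assumptions for illustration); the controller's actual implementation, `deleteVolumeAttachmentsForNode` together with `getLiveVolumeAttachmentsForNode`, uses a lister and aggregates per-object errors, as the `machine_util.go` diff below shows:

```go
package drainsketch

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	storagev1client "k8s.io/client-go/kubernetes/typed/storage/v1"
)

// deleteVolAttachmentsForNode lists all VolumeAttachments in the cluster and
// issues a Delete for each one whose Spec.NodeName matches the drained node.
func deleteVolAttachmentsForNode(ctx context.Context, client storagev1client.VolumeAttachmentInterface, nodeName string) error {
	vaList, err := client.List(ctx, metav1.ListOptions{})
	if err != nil {
		return err
	}
	for i := range vaList.Items {
		va := &vaList.Items[i]
		if va.Spec.NodeName != nodeName {
			continue
		}
		// Stop at the first failure here; the real controller collects errors and continues.
		if err := client.Delete(ctx, va.Name, metav1.DeleteOptions{}); err != nil {
			return err
		}
	}
	return nil
}
```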
diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index aa9e1c2cf..df445192d 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -1020,7 +1020,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver } } - klog.V(3).Infof("(drainNode) nodeReadyCondition: %s, readOnlyFileSystemCondition: %s", nodeReadyCondition, readOnlyFileSystemCondition) + klog.V(3).Infof("(drainNode) For node %q, machine %q, nodeReadyCondition: %s, readOnlyFileSystemCondition: %s", nodeName, machine.Name, nodeReadyCondition, readOnlyFileSystemCondition) if !isConditionEmpty(nodeReadyCondition) && (nodeReadyCondition.Status != v1.ConditionTrue) && (time.Since(nodeReadyCondition.LastTransitionTime.Time) > nodeNotReadyDuration) { message := "Setting forceDeletePods & forceDeleteMachine to true for drain as machine is NotReady for over 5min" forceDeleteMachine = true @@ -1116,12 +1116,11 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver klog.V(2).Infof("Drain successful for machine %q ,providerID %q, backing node %q. \nBuf:%v \nErrBuf:%v", machine.Name, getProviderID(machine), getNodeName(machine), buf, errBuf) if forceDeletePods { - description = fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachments) - err = fmt.Errorf(description) + description = fmt.Sprintf("Force Drain successful. %s", machineutils.DelVolumesAttachments) } else { // regular drain already waits for vol detach and attach for another node. description = fmt.Sprintf("Drain successful. %s", machineutils.InitiateVMDeletion) - err = fmt.Errorf("Machine deletion in process. " + description) } + err = fmt.Errorf(description) state = v1alpha1.MachineStateProcessing // Return error even when machine object is updated @@ -1184,9 +1183,9 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine retryPeriod = 0 } else { // case: where node.Status.VolumesAttached > 0 - liveNodeVolAttachments, err := getLiveVolumeAttachmentsForNode(c.volumeAttachementLister, nodeName) + liveNodeVolAttachments, err := getLiveVolumeAttachmentsForNode(c.volumeAttachementLister, nodeName, machine.Name) if err != nil { - klog.Errorf("(deleteNodeVolAttachments) Error obtaining VolumeAttachment(s) for node %q: ", nodeName, err) + klog.Errorf("(deleteNodeVolAttachments) Error obtaining VolumeAttachment(s) for node %q, machine %q: %s", nodeName, machine.Name, err) return retryPeriod, err } if len(liveNodeVolAttachments) == 0 { @@ -1195,15 +1194,15 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine } else { err = deleteVolumeAttachmentsForNode(ctx, c.targetCoreClient.StorageV1().VolumeAttachments(), nodeName, liveNodeVolAttachments) if err != nil { - klog.Errorf("(deleteNodeVolAttachments) Error deleting volume attachments for node %q: %s", nodeName, err) + klog.Errorf("(deleteNodeVolAttachments) Error deleting volume attachments for node %q, machine %q: %s", nodeName, machine.Name, err) } else { - klog.V(3).Infof("(deleteNodeVolAttachments) Successfully deleted all volume attachments for node %q", nodeName) + klog.V(3).Infof("(deleteNodeVolAttachments) Successfully deleted all volume attachments for node %q, machine %q", nodeName, machine.Name) } return retryPeriod, nil } } now := metav1.Now() - klog.V(4).Infof("(deleteVolumeAttachmentsForNode) For node %q, set LastOperation.Description: %q", nodeName, description) + 
klog.V(4).Infof("(deleteVolumeAttachmentsForNode) For node %q, machine %q, set LastOperation.Description: %q", nodeName, machine.Name, description) err = c.machineStatusUpdate( ctx, machine, @@ -1577,10 +1576,10 @@ func (c *controller) tryMarkingMachineFailed(ctx context.Context, machine, clone return machineutils.ShortRetry, err } -func getLiveVolumeAttachmentsForNode(volAttachLister storagelisters.VolumeAttachmentLister, nodeName string) ([]*storagev1.VolumeAttachment, error) { +func getLiveVolumeAttachmentsForNode(volAttachLister storagelisters.VolumeAttachmentLister, nodeName string, machineName string) ([]*storagev1.VolumeAttachment, error) { volAttachments, err := volAttachLister.List(labels.NewSelector()) if err != nil { - return nil, fmt.Errorf("cant list volume attachments for node %q: %w", nodeName, err) + return nil, fmt.Errorf("cant list volume attachments for node %q, machine %q: %w", nodeName, machineName, err) } nodeVolAttachments := make([]*storagev1.VolumeAttachment, 0, len(volAttachments)) for _, va := range volAttachments { From 632e2992f1ad2b6ec9f70cfd4dcccf611422d979 Mon Sep 17 00:00:00 2001 From: elankath Date: Fri, 22 Sep 2023 13:13:40 +0530 Subject: [PATCH 11/11] fixed unit tests after lastOperation.description adjust --- .../provider/machinecontroller/machine_test.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index ee9bbe447..3e49b0eb8 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -1492,7 +1492,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf("Machine deletion in process. Drain successful. %s", machineutils.InitiateVMDeletion), + err: fmt.Errorf("Drain successful. %s", machineutils.InitiateVMDeletion), retry: machineutils.ShortRetry, nodeTerminationConditionIsSet: true, machine: newMachine( @@ -1697,7 +1697,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf(fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachments)), + err: fmt.Errorf(fmt.Sprintf("Force Drain successful. %s", machineutils.DelVolumesAttachments)), retry: machineutils.ShortRetry, machine: newMachine( &v1alpha1.MachineTemplateSpec{ @@ -1716,7 +1716,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachments), + Description: fmt.Sprintf("Force Drain successful. %s", machineutils.DelVolumesAttachments), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), @@ -1799,7 +1799,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf(fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachments)), + err: fmt.Errorf(fmt.Sprintf("Force Drain successful. %s", machineutils.DelVolumesAttachments)), retry: machineutils.ShortRetry, machine: newMachine( &v1alpha1.MachineTemplateSpec{ @@ -1818,7 +1818,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachments), + Description: fmt.Sprintf("Force Drain successful. 
%s", machineutils.DelVolumesAttachments), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), @@ -1906,7 +1906,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf(fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachments)), + err: fmt.Errorf(fmt.Sprintf("Force Drain successful. %s", machineutils.DelVolumesAttachments)), retry: machineutils.ShortRetry, machine: newMachine( &v1alpha1.MachineTemplateSpec{ @@ -1925,7 +1925,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Drain successful. %s", machineutils.DelVolumesAttachments), + Description: fmt.Sprintf("Force Drain successful. %s", machineutils.DelVolumesAttachments), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), @@ -2008,7 +2008,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf("Machine deletion in process. Drain successful. %s", machineutils.InitiateVMDeletion), + err: fmt.Errorf("Drain successful. %s", machineutils.InitiateVMDeletion), retry: machineutils.ShortRetry, machine: newMachine( &v1alpha1.MachineTemplateSpec{ @@ -2110,7 +2110,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf("Machine deletion in process. Drain successful. %s", machineutils.InitiateVMDeletion), + err: fmt.Errorf("Drain successful. %s", machineutils.InitiateVMDeletion), retry: machineutils.ShortRetry, machine: newMachine( &v1alpha1.MachineTemplateSpec{