diff --git a/cluster-autoscaler/FAQ.md b/cluster-autoscaler/FAQ.md
index e81cb42460e1..221c55d6e516 100644
--- a/cluster-autoscaler/FAQ.md
+++ b/cluster-autoscaler/FAQ.md
@@ -785,6 +785,7 @@ The following startup parameters are supported for cluster autoscaler:
 | `aws-use-static-instance-list` | Should CA fetch instance types in runtime or use a static list. AWS only | false
 | `skip-nodes-with-system-pods` | If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods) | true
 | `skip-nodes-with-local-storage`| If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath | true
+| `skip-nodes-with-custom-controller-pods` | If true cluster autoscaler will never delete nodes with pods owned by custom controllers | true
 | `min-replica-count` | Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down | 0
 | `daemonset-eviction-for-empty-nodes` | Whether DaemonSet pods will be gracefully terminated from empty nodes | false
 | `daemonset-eviction-for-occupied-nodes` | Whether DaemonSet pods will be gracefully terminated from non-empty nodes | true
diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go
index 53fdb08fea7d..595cf1d37a5a 100644
--- a/cluster-autoscaler/config/autoscaling_options.go
+++ b/cluster-autoscaler/config/autoscaling_options.go
@@ -240,6 +240,8 @@ type AutoscalingOptions struct {
 	SkipNodesWithSystemPods bool
 	// SkipNodesWithLocalStorage tells if nodes with pods with local storage, e.g. EmptyDir or HostPath, should be deleted
 	SkipNodesWithLocalStorage bool
+	// SkipNodesWithCustomControllerPods tells if nodes with custom-controller owned pods should be skipped from deletion (skip if 'true')
+	SkipNodesWithCustomControllerPods bool
 	// MinReplicaCount controls the minimum number of replicas that a replica set or replication controller should have
 	// to allow their pods deletion in scale down
 	MinReplicaCount int
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index 9121f8e78002..73fbb6ded0f9 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -169,9 +169,10 @@ func NewStaticAutoscaler(
 	processors.ScaleDownCandidatesNotifier.Register(clusterStateRegistry)
 
 	deleteOptions := simulator.NodeDeleteOptions{
-		SkipNodesWithSystemPods:   opts.SkipNodesWithSystemPods,
-		SkipNodesWithLocalStorage: opts.SkipNodesWithLocalStorage,
-		MinReplicaCount:           opts.MinReplicaCount,
+		SkipNodesWithSystemPods:           opts.SkipNodesWithSystemPods,
+		SkipNodesWithLocalStorage:         opts.SkipNodesWithLocalStorage,
+		MinReplicaCount:                   opts.MinReplicaCount,
+		SkipNodesWithCustomControllerPods: opts.SkipNodesWithCustomControllerPods,
 	}
 
 	// TODO: Populate the ScaleDownActuator/Planner fields in AutoscalingContext
diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go
index 54c07c45afe9..b46b927333d0 100644
--- a/cluster-autoscaler/main.go
+++ b/cluster-autoscaler/main.go
@@ -216,6 +216,7 @@ var (
 	maxNodeGroupBinpackingDuration    = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.")
 	skipNodesWithSystemPods           = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)")
 	skipNodesWithLocalStorage         = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath")
+	skipNodesWithCustomControllerPods = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers")
 	minReplicaCount                   = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down")
 	nodeDeleteDelayAfterTaint         = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it")
 	scaleDownSimulationTimeout        = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.")
@@ -328,6 +329,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		NodeDeleteDelayAfterTaint:         *nodeDeleteDelayAfterTaint,
 		ScaleDownSimulationTimeout:        *scaleDownSimulationTimeout,
 		ParallelDrain:                     *parallelDrain,
+		SkipNodesWithCustomControllerPods: *skipNodesWithCustomControllerPods,
 		NodeGroupSetRatios: config.NodeGroupDifferenceRatios{
 			MaxCapacityMemoryDifferenceRatio:  *maxCapacityMemoryDifferenceRatio,
 			MaxAllocatableDifferenceRatio:     *maxAllocatableDifferenceRatio,
diff --git a/cluster-autoscaler/simulator/drain.go b/cluster-autoscaler/simulator/drain.go
index a21f9c79a2da..fe44a8d7c1c9 100644
--- a/cluster-autoscaler/simulator/drain.go
+++ b/cluster-autoscaler/simulator/drain.go
@@ -35,6 +35,8 @@ type NodeDeleteOptions struct {
 	SkipNodesWithSystemPods bool
 	// SkipNodesWithLocalStorage tells if nodes with pods with local storage, e.g. EmptyDir or HostPath, should be deleted
 	SkipNodesWithLocalStorage bool
+	// SkipNodesWithCustomControllerPods tells if nodes with custom-controller owned pods should be skipped from deletion (skip if 'true')
+	SkipNodesWithCustomControllerPods bool
 	// MinReplicaCount controls the minimum number of replicas that a replica set or replication controller should have
 	// to allow their pods deletion in scale down
 	MinReplicaCount int
@@ -57,6 +59,7 @@ func GetPodsToMove(nodeInfo *schedulerframework.NodeInfo, deleteOptions NodeDele
 		pdbs,
 		deleteOptions.SkipNodesWithSystemPods,
 		deleteOptions.SkipNodesWithLocalStorage,
+		deleteOptions.SkipNodesWithCustomControllerPods,
 		listers,
 		int32(deleteOptions.MinReplicaCount),
 		timestamp)
diff --git a/cluster-autoscaler/simulator/drain_test.go b/cluster-autoscaler/simulator/drain_test.go
index 46ebb7542a47..2275f35ea796 100644
--- a/cluster-autoscaler/simulator/drain_test.go
+++ b/cluster-autoscaler/simulator/drain_test.go
@@ -42,9 +42,10 @@ func TestGetPodsToMove(t *testing.T) {
 		},
 	}
 	deleteOptions := NodeDeleteOptions{
-		SkipNodesWithSystemPods:   true,
-		SkipNodesWithLocalStorage: true,
-		MinReplicaCount:           0,
+		SkipNodesWithSystemPods:           true,
+		SkipNodesWithLocalStorage:         true,
+		MinReplicaCount:                   0,
+		SkipNodesWithCustomControllerPods: true,
 	}
 	_, _, blockingPod, err := GetPodsToMove(schedulerframework.NewNodeInfo(pod1), deleteOptions, nil, nil, testTime)
 	assert.Error(t, err)
diff --git a/cluster-autoscaler/utils/drain/drain.go b/cluster-autoscaler/utils/drain/drain.go
index 4dd8efc58ff8..85096c383778 100644
--- a/cluster-autoscaler/utils/drain/drain.go
+++ b/cluster-autoscaler/utils/drain/drain.go
@@ -78,13 +78,13 @@ func GetPodsForDeletionOnNodeDrain(
 	pdbs []*policyv1.PodDisruptionBudget,
 	skipNodesWithSystemPods bool,
 	skipNodesWithLocalStorage bool,
+	skipNodesWithCustomControllerPods bool,
 	listers kube_util.ListerRegistry,
 	minReplica int32,
 	currentTime time.Time) (pods []*apiv1.Pod, daemonSetPods []*apiv1.Pod, blockingPod *BlockingPod, err error) {
 
 	pods = []*apiv1.Pod{}
 	daemonSetPods = []*apiv1.Pod{}
-	checkReferences := listers != nil
 	// filter kube-system PDBs to avoid doing it for every kube-system pod
 	kubeSystemPDBs := make([]*policyv1.PodDisruptionBudget, 0)
 	for _, pdb := range pdbs {
@@ -93,6 +93,7 @@ func GetPodsForDeletionOnNodeDrain(
 		}
 	}
 
+	isDaemonSetPod := false
 	for _, pod := range podList {
 		if pod_util.IsMirrorPod(pod) {
 			continue
@@ -106,101 +107,25 @@ func GetPodsForDeletionOnNodeDrain(
 			continue
 		}
 
-		isDaemonSetPod := false
 		replicated := false
 		safeToEvict := hasSafeToEvictAnnotation(pod)
 		terminal := isPodTerminal(pod)
 
-		controllerRef := ControllerRef(pod)
-		refKind := ""
-		if controllerRef != nil {
-			refKind = controllerRef.Kind
-		}
-
-		// For now, owner controller must be in the same namespace as the pod
-		// so OwnerReference doesn't have its own Namespace field
-		controllerNamespace := pod.Namespace
-
-		if refKind == "ReplicationController" {
-			if checkReferences {
-				rc, err := listers.ReplicationControllerLister().ReplicationControllers(controllerNamespace).Get(controllerRef.Name)
-				// Assume a reason for an error is because the RC is either
-				// gone/missing or that the rc has too few replicas configured.
-				// TODO: replace the minReplica check with pod disruption budget.
-				if err == nil && rc != nil {
-					if rc.Spec.Replicas != nil && *rc.Spec.Replicas < minReplica {
-						return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: MinReplicasReached}, fmt.Errorf("replication controller for %s/%s has too few replicas spec: %d min: %d",
-							pod.Namespace, pod.Name, rc.Spec.Replicas, minReplica)
-					}
-					replicated = true
-				} else {
-					return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("replication controller for %s/%s is not available, err: %v", pod.Namespace, pod.Name, err)
-				}
-			} else {
-				replicated = true
+		if skipNodesWithCustomControllerPods {
+			// TODO(vadasambar): remove this when we get rid of skipNodesWithCustomControllerPods
+			replicated, isDaemonSetPod, blockingPod, err = legacyCheckForReplicatedPods(listers, pod, minReplica)
+			if err != nil {
+				return []*apiv1.Pod{}, []*apiv1.Pod{}, blockingPod, err
 			}
-		} else if pod_util.IsDaemonSetPod(pod) {
-			isDaemonSetPod = true
-			// don't have listener for other DaemonSet kind
-			// TODO: we should use a generic client for checking the reference.
-			if checkReferences && refKind == "DaemonSet" {
-				_, err := listers.DaemonSetLister().DaemonSets(controllerNamespace).Get(controllerRef.Name)
-				if apierrors.IsNotFound(err) {
-					return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("daemonset for %s/%s is not present, err: %v", pod.Namespace, pod.Name, err)
-				} else if err != nil {
-					return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: UnexpectedError}, fmt.Errorf("error when trying to get daemonset for %s/%s , err: %v", pod.Namespace, pod.Name, err)
-				}
-			}
-		} else if refKind == "Job" {
-			if checkReferences {
-				job, err := listers.JobLister().Jobs(controllerNamespace).Get(controllerRef.Name)
-
-				// Assume the only reason for an error is because the Job is
-				// gone/missing, not for any other cause. TODO(mml): something more
-				// sophisticated than this
-				if err == nil && job != nil {
-					replicated = true
-				} else {
-					return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("job for %s/%s is not available: err: %v", pod.Namespace, pod.Name, err)
-				}
-			} else {
+		} else {
+			if ControllerRef(pod) != nil {
 				replicated = true
 			}
-		} else if refKind == "ReplicaSet" {
-			if checkReferences {
-				rs, err := listers.ReplicaSetLister().ReplicaSets(controllerNamespace).Get(controllerRef.Name)
-
-				// Assume the only reason for an error is because the RS is
-				// gone/missing, not for any other cause. TODO(mml): something more
-				// sophisticated than this
-				if err == nil && rs != nil {
-					if rs.Spec.Replicas != nil && *rs.Spec.Replicas < minReplica {
-						return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: MinReplicasReached}, fmt.Errorf("replication controller for %s/%s has too few replicas spec: %d min: %d",
-							pod.Namespace, pod.Name, rs.Spec.Replicas, minReplica)
-					}
-					replicated = true
-				} else {
-					return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("replication controller for %s/%s is not available, err: %v", pod.Namespace, pod.Name, err)
-				}
-			} else {
-				replicated = true
-			}
-		} else if refKind == "StatefulSet" {
-			if checkReferences {
-				ss, err := listers.StatefulSetLister().StatefulSets(controllerNamespace).Get(controllerRef.Name)
-
-				// Assume the only reason for an error is because the StatefulSet is
-				// gone/missing, not for any other cause. TODO(mml): something more
-				// sophisticated than this
-				if err == nil && ss != nil {
-					replicated = true
-				} else {
-					return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("statefulset for %s/%s is not available: err: %v", pod.Namespace, pod.Name, err)
-				}
-			} else {
-				replicated = true
+			if pod_util.IsDaemonSetPod(pod) {
+				isDaemonSetPod = true
 			}
 		}
+
 		if isDaemonSetPod {
 			daemonSetPods = append(daemonSetPods, pod)
 			continue
@@ -231,6 +156,104 @@ func GetPodsForDeletionOnNodeDrain(
 	return pods, daemonSetPods, nil, nil
 }
 
+func legacyCheckForReplicatedPods(listers kube_util.ListerRegistry, pod *apiv1.Pod, minReplica int32) (replicated bool, isDaemonSetPod bool, blockingPod *BlockingPod, err error) {
+	replicated = false
+	refKind := ""
+	checkReferences := listers != nil
+	isDaemonSetPod = false
+
+	controllerRef := ControllerRef(pod)
+	if controllerRef != nil {
+		refKind = controllerRef.Kind
+	}
+
+	// For now, owner controller must be in the same namespace as the pod
+	// so OwnerReference doesn't have its own Namespace field
+	controllerNamespace := pod.Namespace
+	if refKind == "ReplicationController" {
+		if checkReferences {
+			rc, err := listers.ReplicationControllerLister().ReplicationControllers(controllerNamespace).Get(controllerRef.Name)
+			// Assume a reason for an error is because the RC is either
+			// gone/missing or that the rc has too few replicas configured.
+			// TODO: replace the minReplica check with pod disruption budget.
+			if err == nil && rc != nil {
+				if rc.Spec.Replicas != nil && *rc.Spec.Replicas < minReplica {
+					return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: MinReplicasReached}, fmt.Errorf("replication controller for %s/%s has too few replicas spec: %d min: %d",
+						pod.Namespace, pod.Name, rc.Spec.Replicas, minReplica)
+				}
+				replicated = true
+			} else {
+				return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("replication controller for %s/%s is not available, err: %v", pod.Namespace, pod.Name, err)
+			}
+		} else {
+			replicated = true
+		}
+	} else if pod_util.IsDaemonSetPod(pod) {
+		isDaemonSetPod = true
+		// don't have listener for other DaemonSet kind
+		// TODO: we should use a generic client for checking the reference.
+		if checkReferences && refKind == "DaemonSet" {
+			_, err := listers.DaemonSetLister().DaemonSets(controllerNamespace).Get(controllerRef.Name)
+			if apierrors.IsNotFound(err) {
+				return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("daemonset for %s/%s is not present, err: %v", pod.Namespace, pod.Name, err)
+			} else if err != nil {
+				return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: UnexpectedError}, fmt.Errorf("error when trying to get daemonset for %s/%s , err: %v", pod.Namespace, pod.Name, err)
+			}
+		}
+	} else if refKind == "Job" {
+		if checkReferences {
+			job, err := listers.JobLister().Jobs(controllerNamespace).Get(controllerRef.Name)
+
+			// Assume the only reason for an error is because the Job is
+			// gone/missing, not for any other cause. TODO(mml): something more
+			// sophisticated than this
+			if err == nil && job != nil {
+				replicated = true
+			} else {
+				return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("job for %s/%s is not available: err: %v", pod.Namespace, pod.Name, err)
+			}
+		} else {
+			replicated = true
+		}
+	} else if refKind == "ReplicaSet" {
+		if checkReferences {
+			rs, err := listers.ReplicaSetLister().ReplicaSets(controllerNamespace).Get(controllerRef.Name)
+
+			// Assume the only reason for an error is because the RS is
+			// gone/missing, not for any other cause. TODO(mml): something more
+			// sophisticated than this
+			if err == nil && rs != nil {
+				if rs.Spec.Replicas != nil && *rs.Spec.Replicas < minReplica {
+					return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: MinReplicasReached}, fmt.Errorf("replication controller for %s/%s has too few replicas spec: %d min: %d",
+						pod.Namespace, pod.Name, rs.Spec.Replicas, minReplica)
+				}
+				replicated = true
+			} else {
+				return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("replication controller for %s/%s is not available, err: %v", pod.Namespace, pod.Name, err)
+			}
+		} else {
+			replicated = true
+		}
+	} else if refKind == "StatefulSet" {
+		if checkReferences {
+			ss, err := listers.StatefulSetLister().StatefulSets(controllerNamespace).Get(controllerRef.Name)
+
+			// Assume the only reason for an error is because the StatefulSet is
+			// gone/missing, not for any other cause. TODO(mml): something more
+			// sophisticated than this
+			if err == nil && ss != nil {
+				replicated = true
+			} else {
+				return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("statefulset for %s/%s is not available: err: %v", pod.Namespace, pod.Name, err)
+			}
+		} else {
+			replicated = true
+		}
+	}
+
+	return replicated, isDaemonSetPod, &BlockingPod{}, nil
+}
+
 // ControllerRef returns the OwnerReference to pod's controller.
 func ControllerRef(pod *apiv1.Pod) *metav1.OwnerReference {
 	return metav1.GetControllerOf(pod)
diff --git a/cluster-autoscaler/utils/drain/drain_test.go b/cluster-autoscaler/utils/drain/drain_test.go
index f6e5de7a7638..175416eacad9 100644
--- a/cluster-autoscaler/utils/drain/drain_test.go
+++ b/cluster-autoscaler/utils/drain/drain_test.go
@@ -17,6 +17,7 @@ limitations under the License.
 
 package drain
 
 import (
+	"fmt"
 	"testing"
 	"time"
@@ -33,6 +34,21 @@ import (
 	"github.com/stretchr/testify/assert"
 )
 
+// testOpts represents parameters required for a single unit test
+type testOpts struct {
+	description         string
+	pods                []*apiv1.Pod
+	pdbs                []*policyv1.PodDisruptionBudget
+	rcs                 []*apiv1.ReplicationController
+	replicaSets         []*appsv1.ReplicaSet
+	expectFatal         bool
+	expectPods          []*apiv1.Pod
+	expectDaemonSetPods []*apiv1.Pod
+	expectBlockingPod   *BlockingPod
+	// TODO(vadasambar): remove this when we get rid of scaleDownNodesWithCustomControllerPods
+	skipNodesWithCustomControllerPods bool
+}
+
 func TestDrain(t *testing.T) {
 	testTime := time.Date(2020, time.December, 18, 17, 0, 0, 0, time.UTC)
 	replicas := int32(5)
@@ -183,6 +199,21 @@ func TestDrain(t *testing.T) {
 		},
 	}
 
+	customControllerPod := &apiv1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "bar",
+			Namespace: "default",
+			// Using names like FooController is discouraged
+			// https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#naming-conventions
+			// vadasambar: I am using it here just because `FooController``
+			// is easier to understand than say `FooSet`
+			OwnerReferences: GenerateOwnerReferences("Foo", "FooController", "apps/v1", ""),
+		},
+		Spec: apiv1.PodSpec{
+			NodeName: "node",
+		},
+	}
+
 	nakedPod := &apiv1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
 			Name:      "bar",
@@ -398,17 +429,7 @@ func TestDrain(t *testing.T) {
 		},
 	}
 
-	tests := []struct {
-		description         string
-		pods                []*apiv1.Pod
-		pdbs                []*policyv1.PodDisruptionBudget
-		rcs                 []*apiv1.ReplicationController
-		replicaSets         []*appsv1.ReplicaSet
-		expectFatal         bool
-		expectPods          []*apiv1.Pod
-		expectDaemonSetPods []*apiv1.Pod
-		expectBlockingPod   *BlockingPod
-	}{
+	sharedTests := []testOpts{
 		{
 			description: "RC-managed pod",
 			pods:        []*apiv1.Pod{rcPod},
@@ -624,7 +645,52 @@ func TestDrain(t *testing.T) {
 		},
 	}
 
-	for _, test := range tests {
+	allTests := []testOpts{}
+	// Note: be careful about modifying the underlying reference values for sharedTest
+	// since they are shared (changing it once will change it for all shallow copies of sharedTest)
+	for _, sharedTest := range sharedTests {
+		// make sure you shallow copy the test like this
+		// before you modify it
+		// (so that modifying one test doesn't affect another)
+		enabledTest := sharedTest
+		disabledTest := sharedTest
+
+		// to execute the same shared tests for when the skipNodesWithCustomControllerPods flag is true
+		// and when the flag is false
+		enabledTest.skipNodesWithCustomControllerPods = true
+		enabledTest.description = fmt.Sprintf("%s with skipNodesWithCustomControllerPods:%v",
+			enabledTest.description, enabledTest.skipNodesWithCustomControllerPods)
+		allTests = append(allTests, enabledTest)
+
+		disabledTest.skipNodesWithCustomControllerPods = false
+		disabledTest.description = fmt.Sprintf("%s with skipNodesWithCustomControllerPods:%v",
+			disabledTest.description, disabledTest.skipNodesWithCustomControllerPods)
+		allTests = append(allTests, disabledTest)
+	}
+
+	allTests = append(allTests, testOpts{
+		description:                       "Custom-controller-managed blocking pod",
+		pods:                              []*apiv1.Pod{customControllerPod},
+		pdbs:                              []*policyv1.PodDisruptionBudget{},
+		expectFatal:                       true,
+		expectPods:                        []*apiv1.Pod{},
+		expectBlockingPod:                 &BlockingPod{Pod: customControllerPod, Reason: NotReplicated},
+		expectDaemonSetPods:               []*apiv1.Pod{},
+		skipNodesWithCustomControllerPods: true,
+	})
+
+	allTests = append(allTests, testOpts{
+		description:                       "Custom-controller-managed non-blocking pod",
+		pods:                              []*apiv1.Pod{customControllerPod},
+		pdbs:                              []*policyv1.PodDisruptionBudget{},
+		expectFatal:                       false,
+		expectPods:                        []*apiv1.Pod{customControllerPod},
+		expectBlockingPod:                 &BlockingPod{},
+		expectDaemonSetPods:               []*apiv1.Pod{},
+		skipNodesWithCustomControllerPods: false,
+	})
+
+	for _, test := range allTests {
 		var err error
 		var rcLister v1lister.ReplicationControllerLister
 		if len(test.rcs) > 0 {
@@ -646,7 +712,7 @@ func TestDrain(t *testing.T) {
 
 		registry := kube_util.NewListerRegistry(nil, nil, nil, nil, nil, dsLister, rcLister, jobLister, rsLister, ssLister)
 
-		pods, daemonSetPods, blockingPod, err := GetPodsForDeletionOnNodeDrain(test.pods, test.pdbs, true, true, registry, 0, testTime)
+		pods, daemonSetPods, blockingPod, err := GetPodsForDeletionOnNodeDrain(test.pods, test.pdbs, true, true, test.skipNodesWithCustomControllerPods, registry, 0, testTime)
 
 		if test.expectFatal {
 			assert.Equal(t, test.expectBlockingPod, blockingPod)
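
For reference, a minimal test-style sketch (not part of the patch above) of the behavioral difference the new flag introduces, mirroring the two new "Custom-controller-managed" cases in drain_test.go: with skipNodesWithCustomControllerPods=true (the default) a pod owned by an unrecognized controller kind blocks the drain with reason NotReplicated, while with the flag set to false any pod that has a controller OwnerReference is treated as replicated and is returned for eviction. It assumes the GetPodsForDeletionOnNodeDrain signature shown in this diff and the imports and helpers already present in drain_test.go (apiv1, metav1, assert, time, GenerateOwnerReferences); it is a sketch, not an additional test in the change.

// Sketch only, assuming the drain package's test context shown in this diff.
func TestCustomControllerPodDrainSketch(t *testing.T) {
	// Pod owned by a controller kind the autoscaler has no lister for ("Foo").
	pod := &apiv1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:            "bar",
			Namespace:       "default",
			OwnerReferences: GenerateOwnerReferences("Foo", "FooController", "apps/v1", ""),
		},
		Spec: apiv1.PodSpec{NodeName: "node"},
	}

	// Flag enabled (default): the legacy per-kind checks run and the pod blocks the drain.
	// listers is nil here, so reference checks are skipped; that does not matter for an
	// unknown controller kind, which never counts as replicated on this path.
	_, _, blockingPod, err := GetPodsForDeletionOnNodeDrain(
		[]*apiv1.Pod{pod}, nil, true, true, true, nil, 0, time.Now())
	assert.Error(t, err)
	assert.Equal(t, &BlockingPod{Pod: pod, Reason: NotReplicated}, blockingPod)

	// Flag disabled: any controller-owned pod counts as replicated and can be evicted.
	pods, _, _, err := GetPodsForDeletionOnNodeDrain(
		[]*apiv1.Pod{pod}, nil, true, true, false, nil, 0, time.Now())
	assert.NoError(t, err)
	assert.Equal(t, []*apiv1.Pod{pod}, pods)
}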