Skip to content

Commit

Permalink
Remove ttlAfterNotRegistered value from settings (#250)
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathan-innis authored Mar 27, 2023
1 parent cb03ff7 commit 4eedab8
Show file tree
Hide file tree
Showing 7 changed files with 16 additions and 99 deletions.
6 changes: 0 additions & 6 deletions charts/karpenter-core/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -138,11 +138,5 @@ settings:
# Setting driftEnabled to true enables the drift deprovisioner to watch for drift between currently deployed nodes
# and the desired state of nodes set in provisioners and node templates
driftEnabled: false
# -- ttlAfterNotRegistered is in ALPHA and is planned to be removed in the future, pending design for handling nodes failing launch.
# The maximum length of time to wait for the machine to register to the cluster before terminating the machine.
# Generally, the default of 15m should be sufficient but raising or lowering this may be necessary depending
# on how slowly or quickly nodes join the cluster.
# Setting this value to an empty string disables the ttlAfterNotRegistered deprovisioning
ttlAfterNotRegistered: 15m


16 changes: 5 additions & 11 deletions pkg/apis/settings/settings.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,15 @@ type settingsKeyType struct{}
var ContextKey = settingsKeyType{}

var defaultSettings = &Settings{
BatchMaxDuration: &metav1.Duration{Duration: time.Second * 10},
BatchIdleDuration: &metav1.Duration{Duration: time.Second * 1},
TTLAfterNotRegistered: &metav1.Duration{Duration: time.Minute * 15},
DriftEnabled: false,
BatchMaxDuration: &metav1.Duration{Duration: time.Second * 10},
BatchIdleDuration: &metav1.Duration{Duration: time.Second * 1},
DriftEnabled: false,
}

// +k8s:deepcopy-gen=true
type Settings struct {
BatchMaxDuration *metav1.Duration
BatchIdleDuration *metav1.Duration
TTLAfterNotRegistered *metav1.Duration
BatchMaxDuration *metav1.Duration
BatchIdleDuration *metav1.Duration
// This feature flag is temporary and will be removed in the near future.
DriftEnabled bool
}
Expand All @@ -56,7 +54,6 @@ func (*Settings) Inject(ctx context.Context, cm *v1.ConfigMap) (context.Context,
if err := configmap.Parse(cm.Data,
AsMetaDuration("batchMaxDuration", &s.BatchMaxDuration),
AsMetaDuration("batchIdleDuration", &s.BatchIdleDuration),
AsMetaDuration("ttlAfterNotRegistered", &s.TTLAfterNotRegistered),
configmap.AsBool("featureGates.driftEnabled", &s.DriftEnabled),
); err != nil {
return ctx, fmt.Errorf("parsing settings, %w", err)
Expand All @@ -78,9 +75,6 @@ func (in *Settings) Validate() (err error) {
} else if in.BatchIdleDuration.Duration <= 0 {
err = multierr.Append(err, fmt.Errorf("batchIdleDuration cannot be negative"))
}
if in.TTLAfterNotRegistered != nil && in.TTLAfterNotRegistered.Duration <= 0 {
err = multierr.Append(err, fmt.Errorf("ttlAfterNotRegistered cannot be negative"))
}
return err
}

Expand Down
29 changes: 0 additions & 29 deletions pkg/apis/settings/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,13 @@ var _ = Describe("Validation", func() {
Expect(s.BatchMaxDuration.Duration).To(Equal(time.Second * 10))
Expect(s.BatchIdleDuration.Duration).To(Equal(time.Second))
Expect(s.DriftEnabled).To(BeFalse())
Expect(s.TTLAfterNotRegistered.Duration).To(Equal(time.Minute * 15))
})
It("should succeed to set custom values", func() {
cm := &v1.ConfigMap{
Data: map[string]string{
"batchMaxDuration": "30s",
"batchIdleDuration": "5s",
"featureGates.driftEnabled": "true",
"ttlAfterNotRegistered": "30m",
},
}
ctx, err := (&settings.Settings{}).Inject(ctx, cm)
Expand All @@ -63,24 +61,6 @@ var _ = Describe("Validation", func() {
Expect(s.BatchMaxDuration.Duration).To(Equal(time.Second * 30))
Expect(s.BatchIdleDuration.Duration).To(Equal(time.Second * 5))
Expect(s.DriftEnabled).To(BeTrue())
Expect(s.TTLAfterNotRegistered.Duration).To(Equal(time.Minute * 30))
})
It("should succeed to disable ttlAfterNotRegistered", func() {
cm := &v1.ConfigMap{
Data: map[string]string{
"batchMaxDuration": "30s",
"batchIdleDuration": "5s",
"featureGates.driftEnabled": "true",
"ttlAfterNotRegistered": "",
},
}
ctx, err := (&settings.Settings{}).Inject(ctx, cm)
Expect(err).ToNot(HaveOccurred())
s := settings.FromContext(ctx)
Expect(s.BatchMaxDuration.Duration).To(Equal(time.Second * 30))
Expect(s.BatchIdleDuration.Duration).To(Equal(time.Second * 5))
Expect(s.DriftEnabled).To(BeTrue())
Expect(s.TTLAfterNotRegistered).To(BeNil())
})
It("should fail validation when batchMaxDuration is negative", func() {
cm := &v1.ConfigMap{
Expand Down Expand Up @@ -127,13 +107,4 @@ var _ = Describe("Validation", func() {
_, err := (&settings.Settings{}).Inject(ctx, cm)
Expect(err).To(HaveOccurred())
})
It("should fail validation when ttlAfterNotRegistered is negative", func() {
cm := &v1.ConfigMap{
Data: map[string]string{
"ttlAfterNotRegistered": "-10s",
},
}
_, err := (&settings.Settings{}).Inject(ctx, cm)
Expect(err).To(HaveOccurred())
})
})
5 changes: 0 additions & 5 deletions pkg/apis/settings/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 8 additions & 8 deletions pkg/controllers/machine/liveness.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/aws/karpenter-core/pkg/apis/settings"
"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
"github.com/aws/karpenter-core/pkg/metrics"
"github.com/aws/karpenter-core/pkg/utils/result"
Expand All @@ -40,6 +39,10 @@ type Liveness struct {
// If we don't succeed within this time, then we should delete and try again through some other mechanism
const launchTTL = time.Minute * 2

// registrationTTL is a heuristic time that we expect the node to register within
// If we don't see the node within this time, then we should delete the machine and try again
const registrationTTL = time.Minute * 15

func (l *Liveness) Reconcile(ctx context.Context, machine *v1alpha5.Machine) (reconcile.Result, error) {
creationRes, creationErr := l.launchTTL(ctx, machine)
registrationRes, registrationErr := l.registrationTTL(ctx, machine)
Expand All @@ -57,7 +60,7 @@ func (l *Liveness) launchTTL(ctx context.Context, machine *v1alpha5.Machine) (re
if err := l.kubeClient.Delete(ctx, machine); err != nil {
return reconcile.Result{}, client.IgnoreNotFound(err)
}
logging.FromContext(ctx).With("ttl", time.Minute).Debugf("terminating machine since node hasn't created within creation ttl")
logging.FromContext(ctx).With("ttl", launchTTL).Debugf("terminating machine since node hasn't created within creation ttl")
metrics.MachinesTerminatedCounter.With(prometheus.Labels{
metrics.ReasonLabel: "machine_creation_timeout",
metrics.ProvisionerLabel: machine.Labels[v1alpha5.ProvisionerNameLabelKey],
Expand All @@ -66,20 +69,17 @@ func (l *Liveness) launchTTL(ctx context.Context, machine *v1alpha5.Machine) (re
}

func (l *Liveness) registrationTTL(ctx context.Context, machine *v1alpha5.Machine) (reconcile.Result, error) {
if settings.FromContext(ctx).TTLAfterNotRegistered == nil {
return reconcile.Result{}, nil
}
if machine.StatusConditions().GetCondition(v1alpha5.MachineRegistered).IsTrue() {
return reconcile.Result{}, nil
}
if machine.CreationTimestamp.IsZero() || l.clock.Since(machine.CreationTimestamp.Time) < settings.FromContext(ctx).TTLAfterNotRegistered.Duration {
return reconcile.Result{RequeueAfter: settings.FromContext(ctx).TTLAfterNotRegistered.Duration - l.clock.Since(machine.CreationTimestamp.Time)}, nil
if machine.CreationTimestamp.IsZero() || l.clock.Since(machine.CreationTimestamp.Time) < registrationTTL {
return reconcile.Result{RequeueAfter: registrationTTL - l.clock.Since(machine.CreationTimestamp.Time)}, nil
}
// Delete the machine if we believe the machine won't register since we haven't seen the node
if err := l.kubeClient.Delete(ctx, machine); err != nil {
return reconcile.Result{}, client.IgnoreNotFound(err)
}
logging.FromContext(ctx).With("ttl", settings.FromContext(ctx).TTLAfterNotRegistered.Duration).Debugf("terminating machine since node hasn't registered within registration ttl")
logging.FromContext(ctx).With("ttl", registrationTTL).Debugf("terminating machine since node hasn't registered within registration ttl")
metrics.MachinesTerminatedCounter.With(prometheus.Labels{
metrics.ReasonLabel: "machine_registration_timeout",
metrics.ProvisionerLabel: machine.Labels[v1alpha5.ProvisionerNameLabelKey],
Expand Down
32 changes: 0 additions & 32 deletions pkg/controllers/machine/liveness_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/aws/karpenter-core/pkg/apis/settings"
"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
"github.com/aws/karpenter-core/pkg/cloudprovider/fake"
"github.com/aws/karpenter-core/pkg/test"
Expand Down Expand Up @@ -97,35 +96,4 @@ var _ = Describe("Liveness", func() {
ExpectReconcileSucceeded(ctx, machineController, client.ObjectKeyFromObject(machine)) // Reconcile again to handle termination flow
ExpectNotFound(ctx, env.Client, machine)
})
It("should not delete the Machine when the Node hasn't registered to the Machine past the registration TTL if ttlAfterNotRegistered is disabled", func() {
s := test.Settings()
s.TTLAfterNotRegistered = nil
ctx = settings.ToContext(ctx, s)
machine := test.Machine(v1alpha5.Machine{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
v1alpha5.ProvisionerNameLabelKey: provisioner.Name,
},
},
Spec: v1alpha5.MachineSpec{
Resources: v1alpha5.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("2"),
v1.ResourceMemory: resource.MustParse("50Mi"),
v1.ResourcePods: resource.MustParse("5"),
fake.ResourceGPUVendorA: resource.MustParse("1"),
},
},
},
})
ExpectApplied(ctx, env.Client, provisioner, machine)
ExpectReconcileSucceeded(ctx, machineController, client.ObjectKeyFromObject(machine))
machine = ExpectExists(ctx, env.Client, machine)

// If the node hasn't registered in the liveness timeframe, then we deprovision the Machine
fakeClock.Step(time.Minute * 20)
ExpectReconcileSucceeded(ctx, machineController, client.ObjectKeyFromObject(machine))
ExpectReconcileSucceeded(ctx, machineController, client.ObjectKeyFromObject(machine)) // Reconcile again to handle termination flow
ExpectExists(ctx, env.Client, machine)
})
})
11 changes: 3 additions & 8 deletions pkg/test/settings.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ package test

import (
"fmt"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

Expand All @@ -38,13 +37,9 @@ func Settings(overrides ...settings.Settings) *settings.Settings {
if options.BatchIdleDuration == nil {
options.BatchIdleDuration = &metav1.Duration{}
}
if options.TTLAfterNotRegistered == nil {
options.TTLAfterNotRegistered = &metav1.Duration{Duration: time.Minute * 15}
}
return &settings.Settings{
BatchMaxDuration: options.BatchMaxDuration,
BatchIdleDuration: options.BatchIdleDuration,
TTLAfterNotRegistered: options.TTLAfterNotRegistered,
DriftEnabled: options.DriftEnabled,
BatchMaxDuration: options.BatchMaxDuration,
BatchIdleDuration: options.BatchIdleDuration,
DriftEnabled: options.DriftEnabled,
}
}

0 comments on commit 4eedab8

Please sign in to comment.