Skip to content

Commit

Permalink
chore: Add pod/node count to metrics and logs for disruption (#1025)
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathan-innis authored Feb 19, 2024
1 parent 144ce31 commit df29083
Show file tree
Hide file tree
Showing 12 changed files with 455 additions and 37 deletions.
28 changes: 22 additions & 6 deletions pkg/controllers/disruption/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc
}

func (c *Controller) disrupt(ctx context.Context, disruption Method) (bool, error) {
defer metrics.Measure(disruptionEvaluationDurationHistogram.With(map[string]string{
defer metrics.Measure(EvaluationDurationHistogram.With(map[string]string{
methodLabel: disruption.Type(),
consolidationTypeLabel: disruption.ConsolidationType(),
}))()
Expand Down Expand Up @@ -181,11 +181,6 @@ func (c *Controller) disrupt(ctx context.Context, disruption Method) (bool, erro
// 2. Spin up replacement nodes
// 3. Add Command to orchestration.Queue to wait to delete the candiates.
func (c *Controller) executeCommand(ctx context.Context, m Method, cmd Command, schedulingResults scheduling.Results) error {
disruptionActionsPerformedCounter.With(map[string]string{
actionLabel: string(cmd.Action()),
methodLabel: m.Type(),
consolidationTypeLabel: m.ConsolidationType(),
}).Inc()
commandID := uuid.NewUUID()
logging.FromContext(ctx).With("command-id", commandID).Infof("disrupting via %s %s", m.Type(), cmd)

Expand Down Expand Up @@ -234,6 +229,27 @@ func (c *Controller) executeCommand(ctx context.Context, m Method, cmd Command,
c.cluster.UnmarkForDeletion(providerIDs...)
return fmt.Errorf("adding command to queue (command-id: %s), %w", commandID, multierr.Append(err, state.RequireNoScheduleTaint(ctx, c.kubeClient, false, stateNodes...)))
}

// An action is only performed and pods/nodes are only disrupted after a successful add to the queue
ActionsPerformedCounter.With(map[string]string{
actionLabel: string(cmd.Action()),
methodLabel: m.Type(),
consolidationTypeLabel: m.ConsolidationType(),
}).Inc()
for _, cd := range cmd.candidates {
NodesDisruptedCounter.With(map[string]string{
metrics.NodePoolLabel: cd.nodePool.Name,
actionLabel: string(cmd.Action()),
methodLabel: m.Type(),
consolidationTypeLabel: m.ConsolidationType(),
}).Inc()
PodsDisruptedCounter.With(map[string]string{
metrics.NodePoolLabel: cd.nodePool.Name,
actionLabel: string(cmd.Action()),
methodLabel: m.Type(),
consolidationTypeLabel: m.ConsolidationType(),
}).Add(float64(len(cd.reschedulablePods)))
}
return nil
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/disruption/drift.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func (d *Drift) ComputeCommand(ctx context.Context, disruptionBudgetMapping map[
return candidates[i].NodeClaim.StatusConditions().GetCondition(v1beta1.Drifted).LastTransitionTime.Inner.Time.Before(
candidates[j].NodeClaim.StatusConditions().GetCondition(v1beta1.Drifted).LastTransitionTime.Inner.Time)
})
disruptionEligibleNodesGauge.With(map[string]string{
EligibleNodesGauge.With(map[string]string{
methodLabel: d.Type(),
consolidationTypeLabel: d.ConsolidationType(),
}).Set(float64(len(candidates)))
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/disruption/emptiness.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ func (e *Emptiness) ComputeCommand(_ context.Context, disruptionBudgetMapping ma
return cn.NodeClaim.DeletionTimestamp.IsZero() && len(cn.reschedulablePods) == 0
})

disruptionEligibleNodesGauge.With(map[string]string{
EligibleNodesGauge.With(map[string]string{
methodLabel: e.Type(),
consolidationTypeLabel: e.ConsolidationType(),
}).Set(float64(len(candidates)))
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/disruption/emptynodeconsolidation.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ func (c *EmptyNodeConsolidation) ComputeCommand(ctx context.Context, disruptionB
return Command{}, scheduling.Results{}, nil
}
candidates = c.sortCandidates(candidates)
disruptionEligibleNodesGauge.With(map[string]string{
EligibleNodesGauge.With(map[string]string{
methodLabel: c.Type(),
consolidationTypeLabel: c.ConsolidationType(),
}).Set(float64(len(candidates)))
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/disruption/expiration.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func (e *Expiration) ComputeCommand(ctx context.Context, disruptionBudgetMapping
return candidates[i].NodeClaim.StatusConditions().GetCondition(v1beta1.Expired).LastTransitionTime.Inner.Time.Before(
candidates[j].NodeClaim.StatusConditions().GetCondition(v1beta1.Expired).LastTransitionTime.Inner.Time)
})
disruptionEligibleNodesGauge.With(map[string]string{
EligibleNodesGauge.With(map[string]string{
methodLabel: e.Type(),
consolidationTypeLabel: e.ConsolidationType(),
}).Set(float64(len(candidates)))
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/disruption/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ func BuildDisruptionBudgets(ctx context.Context, cluster *state.Cluster, clk clo
if allowedDisruptions == 0 {
recorder.Publish(disruptionevents.NodePoolBlocked(lo.ToPtr(nodePool)))
}
disruptionBudgetsAllowedDisruptionsGauge.With(map[string]string{
BudgetsAllowedDisruptionsGauge.With(map[string]string{
metrics.NodePoolLabel: nodePool.Name,
}).Set(float64(allowedDisruptions))
}
Expand Down
45 changes: 35 additions & 10 deletions pkg/controllers/disruption/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,15 @@ import (
)

func init() {
crmetrics.Registry.MustRegister(disruptionEvaluationDurationHistogram, disruptionActionsPerformedCounter,
disruptionEligibleNodesGauge, disruptionConsolidationTimeoutTotalCounter, disruptionBudgetsAllowedDisruptionsGauge)
crmetrics.Registry.MustRegister(
EvaluationDurationHistogram,
ActionsPerformedCounter,
NodesDisruptedCounter,
PodsDisruptedCounter,
EligibleNodesGauge,
ConsolidationTimeoutTotalCounter,
BudgetsAllowedDisruptionsGauge,
)
}

const (
Expand All @@ -36,35 +43,53 @@ const (
)

var (
disruptionEvaluationDurationHistogram = prometheus.NewHistogramVec(
EvaluationDurationHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: metrics.Namespace,
Subsystem: disruptionSubsystem,
Name: "evaluation_duration_seconds",
Help: "Duration of the disruption evaluation process in seconds.",
Help: "Duration of the disruption evaluation process in seconds. Labeled by method and consolidation type.",
Buckets: metrics.DurationBuckets(),
},
[]string{methodLabel, consolidationTypeLabel},
)
disruptionActionsPerformedCounter = prometheus.NewCounterVec(
ActionsPerformedCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metrics.Namespace,
Subsystem: disruptionSubsystem,
Name: "actions_performed_total",
Help: "Number of disruption actions performed. Labeled by disruption method.",
Help: "Number of disruption actions performed. Labeled by disruption action, method, and consolidation type.",
},
[]string{actionLabel, methodLabel, consolidationTypeLabel},
)
disruptionEligibleNodesGauge = prometheus.NewGaugeVec(
NodesDisruptedCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metrics.Namespace,
Subsystem: disruptionSubsystem,
Name: "nodes_disrupted_total",
Help: "Total number of nodes disrupted. Labeled by NodePool, disruption action, method, and consolidation type.",
},
[]string{metrics.NodePoolLabel, actionLabel, methodLabel, consolidationTypeLabel},
)
PodsDisruptedCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metrics.Namespace,
Subsystem: disruptionSubsystem,
Name: "pods_disrupted_total",
Help: "Total number of reschedulable pods disrupted on nodes. Labeled by NodePool, disruption action, method, and consolidation type.",
},
[]string{metrics.NodePoolLabel, actionLabel, methodLabel, consolidationTypeLabel},
)
EligibleNodesGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metrics.Namespace,
Subsystem: disruptionSubsystem,
Name: "eligible_nodes",
Help: "Number of nodes eligible for disruption by Karpenter. Labeled by disruption method.",
Help: "Number of nodes eligible for disruption by Karpenter. Labeled by disruption method and consolidation type.",
},
[]string{methodLabel, consolidationTypeLabel},
)
disruptionConsolidationTimeoutTotalCounter = prometheus.NewCounterVec(
ConsolidationTimeoutTotalCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metrics.Namespace,
Subsystem: disruptionSubsystem,
Expand All @@ -73,7 +98,7 @@ var (
},
[]string{consolidationTypeLabel},
)
disruptionBudgetsAllowedDisruptionsGauge = prometheus.NewGaugeVec(
BudgetsAllowedDisruptionsGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metrics.Namespace,
Subsystem: disruptionSubsystem,
Expand Down
4 changes: 2 additions & 2 deletions pkg/controllers/disruption/multinodeconsolidation.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ func (m *MultiNodeConsolidation) ComputeCommand(ctx context.Context, disruptionB
return Command{}, scheduling.Results{}, nil
}
candidates = m.sortCandidates(candidates)
disruptionEligibleNodesGauge.With(map[string]string{
EligibleNodesGauge.With(map[string]string{
methodLabel: m.Type(),
consolidationTypeLabel: m.ConsolidationType(),
}).Set(float64(len(candidates)))
Expand Down Expand Up @@ -123,7 +123,7 @@ func (m *MultiNodeConsolidation) firstNConsolidationOption(ctx context.Context,
// binary search to find the maximum number of NodeClaims we can terminate
for min <= max {
if m.clock.Now().After(timeout) {
disruptionConsolidationTimeoutTotalCounter.WithLabelValues(m.ConsolidationType()).Inc()
ConsolidationTimeoutTotalCounter.WithLabelValues(m.ConsolidationType()).Inc()
if lastSavedCommand.candidates == nil {
logging.FromContext(ctx).Debugf("failed to find a multi-node consolidation after timeout, last considered batch had %d", (min+max)/2)
} else {
Expand Down
4 changes: 2 additions & 2 deletions pkg/controllers/disruption/singlenodeconsolidation.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ func (s *SingleNodeConsolidation) ComputeCommand(ctx context.Context, disruption
return Command{}, scheduling.Results{}, nil
}
candidates = s.sortCandidates(candidates)
disruptionEligibleNodesGauge.With(map[string]string{
EligibleNodesGauge.With(map[string]string{
methodLabel: s.Type(),
consolidationTypeLabel: s.ConsolidationType(),
}).Set(float64(len(candidates)))
Expand All @@ -65,7 +65,7 @@ func (s *SingleNodeConsolidation) ComputeCommand(ctx context.Context, disruption
continue
}
if s.clock.Now().After(timeout) {
disruptionConsolidationTimeoutTotalCounter.WithLabelValues(s.ConsolidationType()).Inc()
ConsolidationTimeoutTotalCounter.WithLabelValues(s.ConsolidationType()).Inc()
logging.FromContext(ctx).Debugf("abandoning single-node consolidation due to timeout after evaluating %d candidates", i)
return Command{}, scheduling.Results{}, nil
}
Expand Down
Loading

0 comments on commit df29083

Please sign in to comment.