Skip to content

Commit

Permalink
fixup! fixup! VPA: immediately prune stale vpa aggregates
Browse files Browse the repository at this point in the history
Signed-off-by: Max Cao <[email protected]>
  • Loading branch information
maxcao13 committed Jan 11, 2025
1 parent cd63322 commit 811fa14
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ func (r *podResourceRecommender) GetRecommendedPodResources(containerNameToAggre
}

fraction := 1.0 / float64(containersPerPod)
klog.V(4).Infof("Spreading recommendation across %d containers (fraction %f)", containersPerPod, fraction)
klog.V(5).InfoS("Spreading recommendation across containers", "containerCount", containersPerPod, "fraction", fraction)
minCPU := model.ScaleResource(model.CPUAmountFromCores(*podMinCPUMillicores*0.001), fraction)
minMemory := model.ScaleResource(model.MemoryAmountFromBytes(*podMinMemoryMb*1024*1024), fraction)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -197,17 +197,24 @@ func (a *AggregateContainerState) MergeContainerState(other *AggregateContainerS
a.LastSampleStart = other.LastSampleStart
}
a.TotalSamplesCount += other.TotalSamplesCount
// if at least one aggregateContainerState is underVPA, then the aggregated state will contribute to a recommendation
if other.IsUnderVPA {
a.IsUnderVPA = true
}
// The merged state should use the pruningGracePeriod of the aggregate with the latest updateTime
if other.LastUpdateTime.After(a.LastUpdateTime) {
a.PruningGracePeriod = other.PruningGracePeriod
a.LastUpdateTime = other.LastUpdateTime
}
}

// NewAggregateContainerState returns a new, empty AggregateContainerState.
func NewAggregateContainerState() *AggregateContainerState {
config := GetAggregationsConfig()
now := time.Now()
return &AggregateContainerState{
AggregateCPUUsage: util.NewDecayingHistogram(config.CPUHistogramOptions, config.CPUHistogramDecayHalfLife),
AggregateMemoryPeaks: util.NewDecayingHistogram(config.MemoryHistogramOptions, config.MemoryHistogramDecayHalfLife),
CreationTime: now,
LastUpdateTime: now,
CreationTime: time.Now(),
}
}

Expand Down
5 changes: 4 additions & 1 deletion vertical-pod-autoscaler/pkg/recommender/model/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,10 @@ func (cluster *ClusterState) AddOrUpdateVpa(apiObject *vpa_types.VerticalPodAuto
// This prevents an old, excessively long grace period from persisting and
// potentially causing the VPA to keep stale aggregates with an outdated grace period.
vpa.PruningGracePeriod = vpa_utils.ParsePruningGracePeriodFromAnnotations(annotationsMap)
for _, containerState := range vpa.aggregateContainerStates {
for _, containerState := range vpa.AggregateContainerStates() {
containerState.UpdatePruningGracePeriod(vpa.PruningGracePeriod)
}
for _, containerState := range vpa.ContainersInitialAggregateState {
containerState.UpdatePruningGracePeriod(vpa.PruningGracePeriod)
}
}
Expand Down
24 changes: 23 additions & 1 deletion vertical-pod-autoscaler/pkg/recommender/routines/recommender.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,9 @@ func (r *recommender) UpdateVPAs() {
if !found {
continue
}
resources := r.podResourceRecommender.GetRecommendedPodResources(GetContainerNameToAggregateStateMap(vpa), vpa.ContainersPerPod)
vpaContainerNameToAggregateStateMap := GetContainerNameToAggregateStateMap(vpa)
klog.V(5).InfoS("Calculating recommendation pod resources...", "vpa", vpa.ID.VpaName, "namespace", vpa.ID.Namespace, "targetRef", vpa.TargetRef)
resources := r.podResourceRecommender.GetRecommendedPodResources(vpaContainerNameToAggregateStateMap, vpa.ContainersPerPod)
had := vpa.HasRecommendation()

listOfResourceRecommendation := logic.MapToListOfRecommendedContainerResources(resources)
Expand All @@ -100,6 +102,26 @@ func (r *recommender) UpdateVPAs() {
listOfResourceRecommendation = postProcessor.Process(observedVpa, listOfResourceRecommendation)
}

if had {
totalTrackedContainerNames := len(vpaContainerNameToAggregateStateMap)
if totalTrackedContainerNames != vpa.ContainersPerPod {
// this means that there are some tracked containers that are stale, but haven't been pruned yet (or won't be pruned be at all) according to vpa pruningGracePeriod
// figure out which containers are stale, and just replace the calculated recommendation with the previous one in the previous vpa.Status
// TODO: We can save computation by checking staleness early and don't calculate a recommendation
now := time.Now()
for i, resources := range observedVpa.Status.Recommendation.ContainerRecommendations {
containerName := resources.ContainerName
containerAggregateState := vpaContainerNameToAggregateStateMap[containerName]
if containerAggregateState != nil && !containerAggregateState.IsUnderVPA && !containerAggregateState.IsAggregateStale(now) {
klog.V(5).InfoS("Container no longer exists, but is not stale. Keeping container recommendation at previous state.", "vpa", vpa.ID.VpaName, "container", containerName)
previousContainerRecommendation := vpa.AsStatus().Recommendation.ContainerRecommendations[i]
// Maybe we should add a "stale=true" field to persist in the vpa.Status.ContainerRecommendation[i] as well?
listOfResourceRecommendation.ContainerRecommendations[i] = previousContainerRecommendation
}
}
}
}

vpa.UpdateRecommendation(listOfResourceRecommendation)
if vpa.HasRecommendation() && !had {
metrics_recommender.ObserveRecommendationLatency(vpa.Created)
Expand Down

0 comments on commit 811fa14

Please sign in to comment.