Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Graduate MultiplePreemptions to Stable #3602

Merged
merged 1 commit into from
Nov 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions pkg/cache/clusterqueue.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,19 +121,6 @@ type queue struct {
admittedUsage resources.FlavorResourceQuantities
}

// FitInCohort supports the legacy
// features.MultiplePreemptions=false path. It doesn't take into
// account BorrowingLimits. To be cleaned up in v0.10, when we delete
// the old code.
func (c *ClusterQueueSnapshot) FitInCohort(q resources.FlavorResourceQuantities) bool {
for fr, value := range q {
if available(c, fr, false) < value {
return false
}
}
return true
}

func (c *clusterQueue) Active() bool {
return c.Status == active
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/cache/clusterqueue_snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ func (c *ClusterQueueSnapshot) BorrowingWith(fr resources.FlavorResource, val in
// Cohort. When the ClusterQueue/Cohort is in debt, Available
// will return 0.
func (c *ClusterQueueSnapshot) Available(fr resources.FlavorResource) int64 {
return max(0, available(c, fr, true))
return max(0, available(c, fr))
}

// PotentialAvailable returns the largest workload this ClusterQueue could
Expand Down
303 changes: 0 additions & 303 deletions pkg/cache/clusterqueue_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,309 +94,6 @@ func TestClusterQueueUpdateWithFlavors(t *testing.T) {
}
}

func TestFitInCohort(t *testing.T) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can try to salvage these tests, if you think they're worth keeping. Let me know what you think, reviewers

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, good question, the function is gone so we need to delete the tests for it, but maybe some of the low-level tests provide coverage for scenarios not handled by higher level tests.

Maybe you could run the following experiment: checkout main branch before this PR and generate coverage for TestFitInCohort, then checkout main and run coverage. check if there are any relevant branches which were covered before but now are missed.

cases := map[string]struct {
request resources.FlavorResourceQuantities
wantFit bool
usage resources.FlavorResourceQuantities
clusterQueue []*kueue.ClusterQueue
disableLendingLimit bool
}{
"full cohort, empty request": {
wantFit: true,
usage: resources.FlavorResourceQuantities{
{Flavor: "f1", Resource: corev1.ResourceCPU}: 5_000,
{Flavor: "f1", Resource: corev1.ResourceMemory}: 5,
{Flavor: "f2", Resource: corev1.ResourceCPU}: 5_000,
{Flavor: "f2", Resource: corev1.ResourceMemory}: 5,
},
clusterQueue: []*kueue.ClusterQueue{
utiltesting.
MakeClusterQueue("CQ").
ResourceGroup(
*utiltesting.MakeFlavorQuotas("f1").
Resource(corev1.ResourceCPU, "5").
Resource(corev1.ResourceMemory, "5").
Obj(),
*utiltesting.MakeFlavorQuotas("f2").
Resource(corev1.ResourceCPU, "5").
Resource(corev1.ResourceMemory, "5").
Obj(),
).
Cohort("C").
Obj(),
},
},
"can fit": {
request: resources.FlavorResourceQuantities{
{Flavor: "f2", Resource: corev1.ResourceCPU}: 1_000,
{Flavor: "f2", Resource: corev1.ResourceMemory}: 1,
},
wantFit: true,
usage: resources.FlavorResourceQuantities{
{Flavor: "f1", Resource: corev1.ResourceCPU}: 5_000,
{Flavor: "f1", Resource: corev1.ResourceMemory}: 5,
{Flavor: "f2", Resource: corev1.ResourceCPU}: 4_000,
{Flavor: "f2", Resource: corev1.ResourceMemory}: 4,
},
clusterQueue: []*kueue.ClusterQueue{
utiltesting.
MakeClusterQueue("CQ").
ResourceGroup(
*utiltesting.MakeFlavorQuotas("f1").
Resource(corev1.ResourceCPU, "5").
Resource(corev1.ResourceMemory, "5").
Obj(),
*utiltesting.MakeFlavorQuotas("f2").
Resource(corev1.ResourceCPU, "5").
Resource(corev1.ResourceMemory, "5").
Obj(),
).
Cohort("C").
Obj(),
},
},
"full cohort, none fit": {
request: resources.FlavorResourceQuantities{
{Flavor: "f1", Resource: corev1.ResourceCPU}: 1_000,
{Flavor: "f1", Resource: corev1.ResourceMemory}: 1,
{Flavor: "f2", Resource: corev1.ResourceCPU}: 1_000,
{Flavor: "f2", Resource: corev1.ResourceMemory}: 1,
},
wantFit: false,
usage: resources.FlavorResourceQuantities{
{Flavor: "f1", Resource: corev1.ResourceCPU}: 5_000,
{Flavor: "f1", Resource: corev1.ResourceMemory}: 5,
{Flavor: "f2", Resource: corev1.ResourceCPU}: 5_000,
{Flavor: "f2", Resource: corev1.ResourceMemory}: 5,
},
clusterQueue: []*kueue.ClusterQueue{
utiltesting.
MakeClusterQueue("CQ").
ResourceGroup(
*utiltesting.MakeFlavorQuotas("f1").
Resource(corev1.ResourceCPU, "5").
Resource(corev1.ResourceMemory, "5").
Obj(),
*utiltesting.MakeFlavorQuotas("f2").
Resource(corev1.ResourceCPU, "5").
Resource(corev1.ResourceMemory, "5").
Obj(),
).
Cohort("C").
Obj(),
},
},
"one cannot fit": {
request: resources.FlavorResourceQuantities{
{Flavor: "f1", Resource: corev1.ResourceCPU}: 1_000,
{Flavor: "f1", Resource: corev1.ResourceMemory}: 1,
{Flavor: "f2", Resource: corev1.ResourceCPU}: 2_000,
{Flavor: "f2", Resource: corev1.ResourceMemory}: 1,
},
wantFit: false,
usage: resources.FlavorResourceQuantities{
{Flavor: "f1", Resource: corev1.ResourceCPU}: 4_000,
{Flavor: "f1", Resource: corev1.ResourceMemory}: 4,
{Flavor: "f2", Resource: corev1.ResourceCPU}: 4_000,
{Flavor: "f2", Resource: corev1.ResourceMemory}: 4,
},
clusterQueue: []*kueue.ClusterQueue{
utiltesting.
MakeClusterQueue("CQ").
ResourceGroup(
*utiltesting.MakeFlavorQuotas("f1").
Resource(corev1.ResourceCPU, "5").
Resource(corev1.ResourceMemory, "5").
Obj(),
*utiltesting.MakeFlavorQuotas("f2").
Resource(corev1.ResourceCPU, "5").
Resource(corev1.ResourceMemory, "5").
Obj(),
).
Cohort("C").
Obj(),
},
},
"missing flavor": {
request: resources.FlavorResourceQuantities{
{Flavor: "non-existent-flavor", Resource: corev1.ResourceCPU}: 1_000,
{Flavor: "non-existent-flavor", Resource: corev1.ResourceMemory}: 1,
},
wantFit: false,
usage: resources.FlavorResourceQuantities{
{Flavor: "f1", Resource: corev1.ResourceCPU}: 5_000,
{Flavor: "f1", Resource: corev1.ResourceMemory}: 5,
},
clusterQueue: []*kueue.ClusterQueue{
utiltesting.
MakeClusterQueue("CQ").
ResourceGroup(
*utiltesting.MakeFlavorQuotas("f1").
Resource(corev1.ResourceCPU, "5").
Resource(corev1.ResourceMemory, "5").
Obj(),
).
Cohort("C").
Obj(),
},
},
"missing resource": {
request: resources.FlavorResourceQuantities{
{Flavor: "f1", Resource: corev1.ResourceCPU}: 1_000,
{Flavor: "f1", Resource: corev1.ResourceMemory}: 1,
},
wantFit: false,
usage: resources.FlavorResourceQuantities{
{Flavor: "f1", Resource: corev1.ResourceCPU}: 3_000,
},
clusterQueue: []*kueue.ClusterQueue{
utiltesting.
MakeClusterQueue("CQ").
ResourceGroup(
*utiltesting.MakeFlavorQuotas("f1").
Resource(corev1.ResourceCPU, "5").
Obj(),
).
Cohort("C").
Obj(),
},
},
"lendingLimit can't fit": {
request: resources.FlavorResourceQuantities{
{Flavor: "f1", Resource: corev1.ResourceCPU}: 3_000,
},
wantFit: false,
usage: resources.FlavorResourceQuantities{
{Flavor: "f1", Resource: corev1.ResourceCPU}: 2_000,
},
clusterQueue: []*kueue.ClusterQueue{
utiltesting.
MakeClusterQueue("CQ").
ResourceGroup(
utiltesting.MakeFlavorQuotas("f1").
ResourceQuotaWrapper(corev1.ResourceCPU).
NominalQuota("2").
Append().
FlavorQuotas,
).
Cohort("C").
Obj(),
utiltesting.
MakeClusterQueue("CQ-B").
ResourceGroup(
utiltesting.MakeFlavorQuotas("f1").
ResourceQuotaWrapper(corev1.ResourceCPU).
NominalQuota("3").
LendingLimit("2").
Append().
FlavorQuotas,
).
Cohort("C").
Obj(),
},
},
"lendingLimit should not affect the fit when feature disabled": {
disableLendingLimit: true,
request: resources.FlavorResourceQuantities{
{Flavor: "f1", Resource: corev1.ResourceCPU}: 3_000,
},
wantFit: true,
usage: resources.FlavorResourceQuantities{
{Flavor: "f1", Resource: corev1.ResourceCPU}: 2_000,
},
clusterQueue: []*kueue.ClusterQueue{
utiltesting.
MakeClusterQueue("CQ").
ResourceGroup(
utiltesting.MakeFlavorQuotas("f1").
ResourceQuotaWrapper(corev1.ResourceCPU).
NominalQuota("2").
Append().
FlavorQuotas,
).
Cohort("C").
Obj(),
utiltesting.
MakeClusterQueue("CQ-B").
ResourceGroup(
utiltesting.MakeFlavorQuotas("f1").
ResourceQuotaWrapper(corev1.ResourceCPU).
NominalQuota("3").
LendingLimit("2").
Append().
FlavorQuotas,
).
Cohort("C").
Obj(),
},
},
"lendingLimit can fit": {
request: resources.FlavorResourceQuantities{
{Flavor: "f1", Resource: corev1.ResourceCPU}: 3_000,
},
wantFit: true,
usage: resources.FlavorResourceQuantities{
{Flavor: "f1", Resource: corev1.ResourceCPU}: 1_000,
},
clusterQueue: []*kueue.ClusterQueue{
utiltesting.
MakeClusterQueue("CQ").
ResourceGroup(
utiltesting.MakeFlavorQuotas("f1").
ResourceQuotaWrapper(corev1.ResourceCPU).
NominalQuota("2").
Append().
FlavorQuotas,
).
Cohort("C").
Obj(),
utiltesting.
MakeClusterQueue("CQ-B").
ResourceGroup(
utiltesting.MakeFlavorQuotas("f1").
ResourceQuotaWrapper(corev1.ResourceCPU).
NominalQuota("3").
LendingLimit("2").
Append().
FlavorQuotas,
).
Cohort("C").
Obj(),
},
},
}

for name, tc := range cases {
t.Run(name, func(t *testing.T) {
ctx, _ := utiltesting.ContextWithLog(t)
if tc.disableLendingLimit {
features.SetFeatureGateDuringTest(t, features.LendingLimit, false)
}
cache := New(utiltesting.NewFakeClient())

cache.AddOrUpdateResourceFlavor(utiltesting.MakeResourceFlavor("f1").Obj())
cache.AddOrUpdateResourceFlavor(utiltesting.MakeResourceFlavor("f2").Obj())

for _, cq := range tc.clusterQueue {
_ = cache.AddClusterQueue(ctx, cq)
}

snapshot, err := cache.Snapshot(ctx)
if err != nil {
t.Fatalf("unexpected error while building snapshot: %v", err)
}
cq := snapshot.ClusterQueues["CQ"]
cq.AddUsage(tc.usage)

got := cq.FitInCohort(tc.request)
if got != tc.wantFit {
t.Errorf("Unexpected result, %v", got)
}
})
}
}

func TestClusterQueueUpdate(t *testing.T) {
resourceFlavors := []*kueue.ResourceFlavor{
{ObjectMeta: metav1.ObjectMeta{Name: "on-demand"}},
Expand Down
10 changes: 3 additions & 7 deletions pkg/cache/resource_node.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,19 +84,15 @@ type hierarchicalResourceNode interface {
// This function may return a negative number in the case of
// overadmission - e.g. capacity was removed or the node moved to
// another Cohort.
//
// We add an option to ignore the borrowing limit, to support
// features.MultiplePreemptions=false path. This will be cleaned up in
// v0.10, when we delete legacy logic.
func available(node hierarchicalResourceNode, fr resources.FlavorResource, enforceBorrowLimit bool) int64 {
func available(node hierarchicalResourceNode, fr resources.FlavorResource) int64 {
r := node.getResourceNode()
if !node.HasParent() {
return r.SubtreeQuota[fr] - r.Usage[fr]
}
localAvailable := max(0, r.guaranteedQuota(fr)-r.Usage[fr])
parentAvailable := available(node.parentHRN(), fr, enforceBorrowLimit)
parentAvailable := available(node.parentHRN(), fr)

if borrowingLimit := r.Quotas[fr].BorrowingLimit; enforceBorrowLimit && borrowingLimit != nil {
if borrowingLimit := r.Quotas[fr].BorrowingLimit; borrowingLimit != nil {
storedInParent := r.SubtreeQuota[fr] - r.guaranteedQuota(fr)
usedInParent := max(0, r.Usage[fr]-r.guaranteedQuota(fr))
withMaxFromParent := storedInParent - usedInParent + *borrowingLimit
Expand Down
5 changes: 4 additions & 1 deletion pkg/features/kube_features.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ const (
// owner: @gabesaba
// alpha: v0.8
// beta: v0.9
// stable: v0.10
//
// remove in v0.12
//
// Enable more than one workload sharing flavors to preempt within a Cohort,
// as long as the preemption targets don't overlap.
Expand Down Expand Up @@ -170,7 +173,7 @@ var defaultFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
MultiKueue: {Default: true, PreRelease: featuregate.Beta},
LendingLimit: {Default: true, PreRelease: featuregate.Beta},
MultiKueueBatchJobWithManagedBy: {Default: false, PreRelease: featuregate.Alpha},
MultiplePreemptions: {Default: true, PreRelease: featuregate.Beta},
MultiplePreemptions: {Default: true, PreRelease: featuregate.GA},
TopologyAwareScheduling: {Default: false, PreRelease: featuregate.Alpha},
ConfigurableResourceTransformations: {Default: true, PreRelease: featuregate.Beta},
WorkloadResourceRequestsSummary: {Default: true, PreRelease: featuregate.Beta},
Expand Down
Loading