Skip to content

Commit

Permalink
Extend PodSetTopologyRequest API by index labels
Browse files Browse the repository at this point in the history
  • Loading branch information
PBundyra committed Dec 2, 2024
1 parent 0e4e2f4 commit 4bb02d4
Show file tree
Hide file tree
Showing 36 changed files with 393 additions and 172 deletions.
15 changes: 15 additions & 0 deletions apis/kueue/v1beta1/workload_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,21 @@ type PodSetTopologyRequest struct {
//
// +optional
Preferred *string `json:"preferred,omitempty"`

// PodIndexLabel indicates the name of the label indexing the pods.
// For example, in the context of
// - kubernetes job this is: batch.kubernetes.io/job-completion-index
// - JobSet: batch.kubernetes.io/job-completion-index (inherited from Job)
// - Kubeflow: training.kubeflow.org/replica-index
PodIndexLabel *string `json:"podIndexLabel,omitempty"`

// SubGroupIndexLabel indicates the name of the label indexing the instances of replicated Jobs (groups)
// within a PodSet. For example, in the context of JobSet this is jobset.sigs.k8s.io/job-index.
SubGroupIndexLabel *string `json:"subGroupIndexLabel,omitempty"`

// SubGroupCount indicates the count of replicated Jobs (groups) within a PodSet.
// For example, in the context of JobSet this value is read from jobset.sigs.k8s.io/replicatedjob-replicas.
SubGroupCount *int32 `json:"subGroupCount,omitempty"`
}

type Admission struct {
Expand Down
15 changes: 15 additions & 0 deletions apis/kueue/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions charts/kueue/templates/crd/kueue.x-k8s.io_workloads.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8196,6 +8196,14 @@ spec:
description: topologyRequest defines the topology request for
the PodSet.
properties:
podIndexLabel:
description: |-
PodIndexLabel indicates the name of the label indexing the pods.
For example, in the context of
- kubernetes job this is: batch.kubernetes.io/job-completion-index
- JobSet: batch.kubernetes.io/job-completion-index (inherited from Job)
- Kubeflow: training.kubeflow.org/replica-index
type: string
preferred:
description: |-
preferred indicates the topology level preferred by the PodSet, as
Expand All @@ -8208,6 +8216,17 @@ spec:
indicated by the `kueue.x-k8s.io/podset-required-topology` PodSet
annotation.
type: string
subGroupCount:
description: |-
SubGroupCount indicates the count of replicated Jobs (groups) within a PodSet.
For example, in the context of JobSet this value is read from jobset.sigs.k8s.io/replicatedjob-replicas.
format: int32
type: integer
subGroupIndexLabel:
description: |-
SubGroupIndexLabel indicates the name of the label indexing the instances of replicated Jobs (groups)
within a PodSet. For example, in the context of JobSet this is jobset.sigs.k8s.io/job-index.
type: string
type: object
required:
- count
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions config/components/crd/bases/kueue.x-k8s.io_workloads.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8181,6 +8181,14 @@ spec:
description: topologyRequest defines the topology request for
the PodSet.
properties:
podIndexLabel:
description: |-
PodIndexLabel indicates the name of the label indexing the pods.
For example, in the context of
- kubernetes job this is: batch.kubernetes.io/job-completion-index
- JobSet: batch.kubernetes.io/job-completion-index (inherited from Job)
- Kubeflow: training.kubeflow.org/replica-index
type: string
preferred:
description: |-
preferred indicates the topology level preferred by the PodSet, as
Expand All @@ -8193,6 +8201,17 @@ spec:
indicated by the `kueue.x-k8s.io/podset-required-topology` PodSet
annotation.
type: string
subGroupCount:
description: |-
SubGroupCount indicates the count of replicated Jobs (groups) within a PodSet.
For example, in the context of JobSet this value is read from jobset.sigs.k8s.io/replicatedjob-replicas.
format: int32
type: integer
subGroupIndexLabel:
description: |-
SubGroupIndexLabel indicates the name of the label indexing the instances of replicated Jobs (groups)
within a PodSet. For example, in the context of JobSet this is jobset.sigs.k8s.io/job-index.
type: string
type: object
required:
- count
Expand Down
15 changes: 15 additions & 0 deletions keps/2724-topology-aware-scheduling/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,21 @@ type PodSetTopologyRequest struct {
//
// +optional
Preferred *string `json:"preferred,omitempty"`

// PodIndexLabel indicates the name of the label indexing the pods.
// For example, in the context of
// - kubernetes job this is: batch.kubernetes.io/job-completion-index
// - JobSet: batch.kubernetes.io/job-completion-index (inherited from Job)
// - Kubeflow: training.kubeflow.org/replica-index
PodIndexLabel *string

// SubGroupIndexLabel indicates the name of the label indexing the instances of replicated Jobs (groups)
// within a PodSet. For example, in the context of JobSet this is jobset.sigs.k8s.io/job-index.
SubGroupIndexLabel *string

// SubGroupCount indicates the count of replicated Jobs (groups) within a PodSet.
// For example, in the context of JobSet this value is read from jobset.sigs.k8s.io/replicatedjob-replicas.
SubGroupCount *int32
}
```

Expand Down
19 changes: 11 additions & 8 deletions pkg/controller/jobframework/tas.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,27 @@ package jobframework

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"

kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1"
kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
)

// PodSetTopologyRequest builds the topology request of a PodSet from the
// annotations on the given pod template metadata.
//
// podIndexLabel, subGroupIndexLabel and subGroupCount are copied onto the
// returned request unchanged. If the required-topology annotation is present it
// wins and only Required is set; otherwise the preferred-topology annotation,
// if present, sets Preferred. When neither annotation is present the function
// returns nil. meta is dereferenced unconditionally, so it must not be nil.
func PodSetTopologyRequest(meta *metav1.ObjectMeta) *kueue.PodSetTopologyRequest {
func PodSetTopologyRequest(meta *metav1.ObjectMeta, podIndexLabel *string, subGroupIndexLabel *string, subGroupCount *int32) *kueue.PodSetTopologyRequest {
psTopologyReq := &kueue.PodSetTopologyRequest{
PodIndexLabel: podIndexLabel,
SubGroupIndexLabel: subGroupIndexLabel,
SubGroupCount: subGroupCount,
}

requiredValue, requiredFound := meta.Annotations[kueuealpha.PodSetRequiredTopologyAnnotation]
if requiredFound {
return &kueue.PodSetTopologyRequest{
Required: ptr.To(requiredValue),
}
psTopologyReq.Required = &requiredValue
return psTopologyReq
}
preferredValue, preferredFound := meta.Annotations[kueuealpha.PodSetPreferredTopologyAnnotation]
if preferredFound {
return &kueue.PodSetTopologyRequest{
Preferred: ptr.To(preferredValue),
}
psTopologyReq.Preferred = &preferredValue
return psTopologyReq
}
// Neither topology annotation is set: no request is produced, so the index
// fields prepared above are discarded as well.
return nil
}
11 changes: 6 additions & 5 deletions pkg/controller/jobs/job/job_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -249,11 +249,12 @@ func cleanManagedLabels(pt *corev1.PodTemplateSpec) *corev1.PodTemplateSpec {
// PodSets returns the single PodSet describing this Job: a deep copy of the pod
// template passed through cleanManagedLabels, the pod count and minimum count,
// and the topology request derived from the template's metadata.
func (j *Job) PodSets() []kueue.PodSet {
return []kueue.PodSet{
{
Name: kueue.DefaultPodSetName,
Template: *cleanManagedLabels(j.Spec.Template.DeepCopy()),
Count: j.podsCount(),
MinCount: j.minPodsCount(),
TopologyRequest: jobframework.PodSetTopologyRequest(&j.Spec.Template.ObjectMeta),
Name: kueue.DefaultPodSetName,
Template: *cleanManagedLabels(j.Spec.Template.DeepCopy()),
Count: j.podsCount(),
MinCount: j.minPodsCount(),
// The pod index label is batchv1.JobCompletionIndexAnnotation; no sub-group
// index label or sub-group count is passed (both nil).
TopologyRequest: jobframework.PodSetTopologyRequest(&j.Spec.Template.ObjectMeta,
ptr.To(batchv1.JobCompletionIndexAnnotation), nil, nil),
},
}
}
Expand Down
10 changes: 6 additions & 4 deletions pkg/controller/jobs/job/job_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -370,8 +370,9 @@ func TestPodSets(t *testing.T) {
Template: jobTemplate.Clone().
PodAnnotation(kueuealpha.PodSetRequiredTopologyAnnotation, "cloud.com/block").
Spec.Template,
Count: 3,
TopologyRequest: &kueue.PodSetTopologyRequest{Required: ptr.To("cloud.com/block")},
Count: 3,
TopologyRequest: &kueue.PodSetTopologyRequest{Required: ptr.To("cloud.com/block"),
PodIndexLabel: ptr.To(batchv1.JobCompletionIndexAnnotation)},
},
},
},
Expand All @@ -387,8 +388,9 @@ func TestPodSets(t *testing.T) {
Template: jobTemplate.Clone().
PodAnnotation(kueuealpha.PodSetPreferredTopologyAnnotation, "cloud.com/block").
Spec.Template,
Count: 3,
TopologyRequest: &kueue.PodSetTopologyRequest{Preferred: ptr.To("cloud.com/block")},
Count: 3,
TopologyRequest: &kueue.PodSetTopologyRequest{Preferred: ptr.To("cloud.com/block"),
PodIndexLabel: ptr.To(batchv1.JobCompletionIndexAnnotation)},
}},
},
}
Expand Down
11 changes: 7 additions & 4 deletions pkg/controller/jobs/jobset/jobset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"fmt"
"strings"

batchv1 "k8s.io/api/batch/v1"
apimeta "k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
Expand Down Expand Up @@ -118,10 +119,12 @@ func (j *JobSet) PodSets() []kueue.PodSet {
podSets := make([]kueue.PodSet, len(j.Spec.ReplicatedJobs))
for index, replicatedJob := range j.Spec.ReplicatedJobs {
podSets[index] = kueue.PodSet{
Name: replicatedJob.Name,
Template: *replicatedJob.Template.Spec.Template.DeepCopy(),
Count: podsCount(&replicatedJob),
TopologyRequest: jobframework.PodSetTopologyRequest(&replicatedJob.Template.Spec.Template.ObjectMeta),
Name: replicatedJob.Name,
Template: *replicatedJob.Template.Spec.Template.DeepCopy(),
Count: podsCount(&replicatedJob),
TopologyRequest: jobframework.PodSetTopologyRequest(&replicatedJob.Template.Spec.Template.ObjectMeta,
ptr.To(batchv1.JobCompletionIndexAnnotation), ptr.To(jobsetapi.JobIndexKey),
ptr.To(replicatedJob.Replicas)),
}
}
return podSets
Expand Down
25 changes: 17 additions & 8 deletions pkg/controller/jobs/jobset/jobset_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/record"
Expand Down Expand Up @@ -245,10 +246,14 @@ func TestPodSets(t *testing.T) {
wantPodSets: func(jobSet *JobSet) []kueue.PodSet {
return []kueue.PodSet{
{
Name: jobSet.Spec.ReplicatedJobs[0].Name,
Template: *jobSet.Spec.ReplicatedJobs[0].Template.Spec.Template.DeepCopy(),
Count: 2,
TopologyRequest: &kueue.PodSetTopologyRequest{Required: ptr.To("cloud.com/block")},
Name: jobSet.Spec.ReplicatedJobs[0].Name,
Template: *jobSet.Spec.ReplicatedJobs[0].Template.Spec.Template.DeepCopy(),
Count: 2,
TopologyRequest: &kueue.PodSetTopologyRequest{Required: ptr.To("cloud.com/block"),
PodIndexLabel: ptr.To(batchv1.JobCompletionIndexAnnotation),
SubGroupIndexLabel: ptr.To(jobset.JobIndexKey),
SubGroupCount: ptr.To[int32](2),
},
},
{
Name: jobSet.Spec.ReplicatedJobs[1].Name,
Expand Down Expand Up @@ -281,10 +286,14 @@ func TestPodSets(t *testing.T) {
Count: 2,
},
{
Name: jobSet.Spec.ReplicatedJobs[1].Name,
Template: *jobSet.Spec.ReplicatedJobs[1].Template.Spec.Template.DeepCopy(),
Count: 6,
TopologyRequest: &kueue.PodSetTopologyRequest{Preferred: ptr.To("cloud.com/block")},
Name: jobSet.Spec.ReplicatedJobs[1].Name,
Template: *jobSet.Spec.ReplicatedJobs[1].Template.Spec.Template.DeepCopy(),
Count: 6,
TopologyRequest: &kueue.PodSetTopologyRequest{Preferred: ptr.To("cloud.com/block"),
PodIndexLabel: ptr.To(batchv1.JobCompletionIndexAnnotation),
SubGroupIndexLabel: ptr.To(jobset.JobIndexKey),
SubGroupCount: ptr.To[int32](3),
},
},
}
},
Expand Down
19 changes: 11 additions & 8 deletions pkg/controller/jobs/kubeflow/jobs/mxjob/mxjob_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,16 +333,19 @@ func TestPodSets(t *testing.T) {
wantPodSets: func(job *kftraining.MXJob) []kueue.PodSet {
return []kueue.PodSet{
{
Name: strings.ToLower(string(kftraining.MXJobReplicaTypeScheduler)),
Template: job.Spec.MXReplicaSpecs[kftraining.MXJobReplicaTypeScheduler].Template,
Count: 1,
TopologyRequest: &kueue.PodSetTopologyRequest{Required: ptr.To("cloud.com/rack")},
Name: strings.ToLower(string(kftraining.MXJobReplicaTypeScheduler)),
Template: job.Spec.MXReplicaSpecs[kftraining.MXJobReplicaTypeScheduler].Template,
Count: 1,
TopologyRequest: &kueue.PodSetTopologyRequest{Required: ptr.To("cloud.com/rack"),
PodIndexLabel: ptr.To(kftraining.ReplicaIndexLabel)},
},
{
Name: strings.ToLower(string(kftraining.MXJobReplicaTypeServer)),
Template: job.Spec.MXReplicaSpecs[kftraining.MXJobReplicaTypeServer].Template,
Count: 1,
TopologyRequest: &kueue.PodSetTopologyRequest{Preferred: ptr.To("cloud.com/block")},
Name: strings.ToLower(string(kftraining.MXJobReplicaTypeServer)),
Template: job.Spec.MXReplicaSpecs[kftraining.MXJobReplicaTypeServer].Template,
Count: 1,
TopologyRequest: &kueue.PodSetTopologyRequest{Preferred: ptr.To("cloud.com/block"),
PodIndexLabel: ptr.To(kftraining.ReplicaIndexLabel),
},
},
{
Name: strings.ToLower(string(kftraining.MXJobReplicaTypeWorker)),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -286,16 +286,18 @@ func TestPodSets(t *testing.T) {
wantPodSets: func(job *kftraining.PaddleJob) []kueue.PodSet {
return []kueue.PodSet{
{
Name: strings.ToLower(string(kftraining.PaddleJobReplicaTypeMaster)),
Template: job.Spec.PaddleReplicaSpecs[kftraining.PaddleJobReplicaTypeMaster].Template,
Count: 1,
TopologyRequest: &kueue.PodSetTopologyRequest{Required: ptr.To("cloud.com/rack")},
Name: strings.ToLower(string(kftraining.PaddleJobReplicaTypeMaster)),
Template: job.Spec.PaddleReplicaSpecs[kftraining.PaddleJobReplicaTypeMaster].Template,
Count: 1,
TopologyRequest: &kueue.PodSetTopologyRequest{Required: ptr.To("cloud.com/rack"),
PodIndexLabel: ptr.To(kftraining.ReplicaIndexLabel)},
},
{
Name: strings.ToLower(string(kftraining.PaddleJobReplicaTypeWorker)),
Template: job.Spec.PaddleReplicaSpecs[kftraining.PaddleJobReplicaTypeWorker].Template,
Count: 1,
TopologyRequest: &kueue.PodSetTopologyRequest{Preferred: ptr.To("cloud.com/block")},
Name: strings.ToLower(string(kftraining.PaddleJobReplicaTypeWorker)),
Template: job.Spec.PaddleReplicaSpecs[kftraining.PaddleJobReplicaTypeWorker].Template,
Count: 1,
TopologyRequest: &kueue.PodSetTopologyRequest{Preferred: ptr.To("cloud.com/block"),
PodIndexLabel: ptr.To(kftraining.ReplicaIndexLabel)},
},
}
},
Expand Down
Loading

0 comments on commit 4bb02d4

Please sign in to comment.