Skip to content

Commit

Permalink
Add MPI Job support, avoid redundant allocation
Browse files (browse the repository at this point in the history)
  • Loading branch information
PBundyra committed Dec 3, 2024
1 parent 4bb02d4 commit c8c60b3
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 19 deletions.
24 changes: 12 additions & 12 deletions pkg/controller/jobframework/tas.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,20 @@ import (
)

func PodSetTopologyRequest(meta *metav1.ObjectMeta, podIndexLabel *string, subGroupIndexLabel *string, subGroupCount *int32) *kueue.PodSetTopologyRequest {
psTopologyReq := &kueue.PodSetTopologyRequest{
PodIndexLabel: podIndexLabel,
SubGroupIndexLabel: subGroupIndexLabel,
SubGroupCount: subGroupCount,
}

requiredValue, requiredFound := meta.Annotations[kueuealpha.PodSetRequiredTopologyAnnotation]
if requiredFound {
psTopologyReq.Required = &requiredValue
return psTopologyReq
}
preferredValue, preferredFound := meta.Annotations[kueuealpha.PodSetPreferredTopologyAnnotation]
if preferredFound {
psTopologyReq.Preferred = &preferredValue

if requiredFound || preferredFound {
psTopologyReq := &kueue.PodSetTopologyRequest{
PodIndexLabel: podIndexLabel,
SubGroupIndexLabel: subGroupIndexLabel,
SubGroupCount: subGroupCount,
}
if requiredFound {
psTopologyReq.Required = &requiredValue
} else {
psTopologyReq.Preferred = &preferredValue
}
return psTopologyReq
}
return nil
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/jobs/mpijob/mpijob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ func (j *MPIJob) PodSets() []kueue.PodSet {
Name: strings.ToLower(string(mpiReplicaType)),
Template: *j.Spec.MPIReplicaSpecs[mpiReplicaType].Template.DeepCopy(),
Count: podsCount(&j.Spec, mpiReplicaType),
TopologyRequest: jobframework.PodSetTopologyRequest(&j.Spec.MPIReplicaSpecs[mpiReplicaType].Template.ObjectMeta, nil, nil, nil),
TopologyRequest: jobframework.PodSetTopologyRequest(&j.Spec.MPIReplicaSpecs[mpiReplicaType].Template.ObjectMeta, ptr.To(kfmpi.ReplicaIndexLabel), nil, nil),
}
}
return podSets
Expand Down
16 changes: 12 additions & 4 deletions pkg/controller/jobs/mpijob/mpijob_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,9 @@ func TestPodSets(t *testing.T) {
"cloud.com/block",
).
Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeLauncher].Template,
TopologyRequest: &kueue.PodSetTopologyRequest{Required: ptr.To("cloud.com/block")},
TopologyRequest: &kueue.PodSetTopologyRequest{
Required: ptr.To("cloud.com/block"),
PodIndexLabel: ptr.To(kfmpi.ReplicaIndexLabel)},
},
{
Name: strings.ToLower(string(kfmpi.MPIReplicaTypeWorker)),
Expand All @@ -263,7 +265,9 @@ func TestPodSets(t *testing.T) {
"cloud.com/block",
).
Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker].Template,
TopologyRequest: &kueue.PodSetTopologyRequest{Required: ptr.To("cloud.com/block")},
TopologyRequest: &kueue.PodSetTopologyRequest{
Required: ptr.To("cloud.com/block"),
PodIndexLabel: ptr.To(kfmpi.ReplicaIndexLabel)},
},
},
},
Expand Down Expand Up @@ -298,7 +302,9 @@ func TestPodSets(t *testing.T) {
"cloud.com/block",
).
Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeLauncher].Template,
TopologyRequest: &kueue.PodSetTopologyRequest{Preferred: ptr.To("cloud.com/block")},
TopologyRequest: &kueue.PodSetTopologyRequest{
Preferred: ptr.To("cloud.com/block"),
PodIndexLabel: ptr.To(kfmpi.ReplicaIndexLabel)},
},
{
Name: strings.ToLower(string(kfmpi.MPIReplicaTypeWorker)),
Expand All @@ -316,7 +322,9 @@ func TestPodSets(t *testing.T) {
"cloud.com/block",
).
Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker].Template,
TopologyRequest: &kueue.PodSetTopologyRequest{Preferred: ptr.To("cloud.com/block")},
TopologyRequest: &kueue.PodSetTopologyRequest{
Preferred: ptr.To("cloud.com/block"),
PodIndexLabel: ptr.To(kfmpi.ReplicaIndexLabel)},
},
},
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1019,14 +1019,16 @@ var _ = ginkgo.Describe("MPIJob controller when TopologyAwareScheduling enabled"
Name: strings.ToLower(string(kfmpi.MPIReplicaTypeLauncher)),
Count: 1,
TopologyRequest: &kueue.PodSetTopologyRequest{
Required: ptr.To(tasBlockLabel),
Required: ptr.To(tasBlockLabel),
PodIndexLabel: ptr.To(kfmpi.ReplicaIndexLabel),
},
},
{
Name: strings.ToLower(string(kfmpi.MPIReplicaTypeWorker)),
Count: 1,
TopologyRequest: &kueue.PodSetTopologyRequest{
Preferred: ptr.To(tasRackLabel),
Preferred: ptr.To(tasRackLabel),
PodIndexLabel: ptr.To(kfmpi.ReplicaIndexLabel),
},
},
}, cmpopts.IgnoreFields(kueue.PodSet{}, "Template")))
Expand Down

0 comments on commit c8c60b3

Please sign in to comment.