Skip to content

Commit

Permalink
Remarks
Browse files Browse the repository at this point in the history
  • Loading branch information
mimowo committed Jan 30, 2023
1 parent 3cdd581 commit aaf12e6
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 35 deletions.
18 changes: 9 additions & 9 deletions crd/kubeflow.org_mpijobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7886,15 +7886,15 @@ spec:
type: string
suspend:
default: false
description: "suspend specifies whether the Job controller should
create Pods or not. If a Job is created with suspend set to true,
no Pods are created by the Job controller. If a Job is suspended
after creation (i.e. the flag goes from false to true), the Job
controller will delete all active Pods associated with this Job.
Users must design their workload to gracefully handle this. Suspending
a Job will reset the StartTime field of the Job, effectively resetting
the ActiveDeadlineSeconds timer too. Defaults to false. \n Defaults
to false."
description: "suspend specifies whether the MPIJob controller should
create Pods or not. If an MPIJob is created with suspend set to true,
no Pods are created by the MPIJob controller. If an MPIJob is suspended
after creation (i.e. the flag goes from false to true), the MPIJob
controller will delete all active Pods associated with this MPIJob.
Also, it will suspend the Launcher Job. Users must design their
workload to gracefully handle this. Suspending an MPIJob will reset
the StartTime field of the MPIJob, effectively resetting the activeDeadlineSeconds
timer too. Defaults to false. \n Defaults to false."
type: boolean
required:
- mpiReplicaSpecs
Expand Down
2 changes: 1 addition & 1 deletion pkg/apis/kubeflow/v2beta1/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@
"type": "string"
},
"suspend": {
"description": "suspend specifies whether the Job controller should create Pods or not. If a Job is created with suspend set to true, no Pods are created by the Job controller. If a Job is suspended after creation (i.e. the flag goes from false to true), the Job controller will delete all active Pods associated with this Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the Job, effectively resetting the ActiveDeadlineSeconds timer too. Defaults to false.\n\nDefaults to false.",
"description": "suspend specifies whether the MPIJob controller should create Pods or not. If a MPIJob is created with suspend set to true, no Pods are created by the MPIJob controller. If a MPIJob is suspended after creation (i.e. the flag goes from false to true), the MPIJob controller will delete all active Pods associated with this MPIJob. Also, it will suspend the Launcher Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the MPIJob, effectively resetting the activeDeadlineSeconds timer too. Defaults to false.\n\nDefaults to false.",
"type": "boolean"
}
}
Expand Down
15 changes: 8 additions & 7 deletions pkg/apis/kubeflow/v2beta1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,14 @@ type MPIJobSpec struct {
// +kubebuilder:default:=OpenMPI
MPIImplementation MPIImplementation `json:"mpiImplementation,omitempty"`

// suspend specifies whether the Job controller should create Pods or not. If
// a Job is created with suspend set to true, no Pods are created by the Job
// controller. If a Job is suspended after creation (i.e. the flag goes from
// false to true), the Job controller will delete all active Pods associated
// with this Job. Users must design their workload to gracefully handle this.
// Suspending a Job will reset the StartTime field of the Job, effectively
// resetting the ActiveDeadlineSeconds timer too. Defaults to false.
// suspend specifies whether the MPIJob controller should create Pods or not.
// If an MPIJob is created with suspend set to true, no Pods are created by
// the MPIJob controller. If an MPIJob is suspended after creation (i.e. the
// flag goes from false to true), the MPIJob controller will delete all
// active Pods associated with this MPIJob. Also, it will suspend the
// Launcher Job. Users must design their workload to gracefully handle this.
// Suspending an MPIJob will reset the StartTime field of the MPIJob, effectively
// resetting the activeDeadlineSeconds timer too. Defaults to false.
//
// Defaults to false.
// +kubebuilder:default:=false
Expand Down
21 changes: 6 additions & 15 deletions pkg/controller/mpi_job_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,6 @@ func (c *MPIJobController) processNextWorkItem() bool {
// converge the two. It then updates the Status block of the MPIJob resource
// with the current status of the resource.
func (c *MPIJobController) syncHandler(key string) error {
klog.Infof("___ MYDEBUG starting for %s", key)
startTime := time.Now()
defer func() {
klog.Infof("Finished syncing job %q (%v)", key, time.Since(startTime))
Expand Down Expand Up @@ -505,7 +504,7 @@ func (c *MPIJobController) syncHandler(key string) error {
// cleanup and stop retrying the MPIJob.
if isFinished(mpiJob.Status) && mpiJob.Status.CompletionTime != nil {
if isCleanUpPods(mpiJob.Spec.RunPolicy.CleanPodPolicy) {
return cleanUpPods(mpiJob, c)
return cleanUpWorkerPods(mpiJob, c)
}
return nil
}
Expand Down Expand Up @@ -578,17 +577,9 @@ func (c *MPIJobController) syncHandler(key string) error {
}

if launcher != nil {
launcherSuspendUpdate := false
if isMPIJobSuspended(mpiJob) && !isJobSuspended(launcher) {
// suspend the launcher first if the MPI job is suspended
launcherSuspendUpdate = true
launcher.Spec.Suspend = pointer.Bool(true)
} else if !isMPIJobSuspended(mpiJob) && isJobSuspended(launcher) {
launcherSuspendUpdate = true
// unsuspend the launcher first if the MPI job is unsuspended
launcher.Spec.Suspend = pointer.Bool(false)
}
if launcherSuspendUpdate {
if isMPIJobSuspended(mpiJob) != isJobSuspended(launcher) {
// align the suspension state of launcher with the MPIJob
launcher.Spec.Suspend = pointer.Bool(isMPIJobSuspended(mpiJob))
if _, err := c.kubeClient.BatchV1().Jobs(namespace).Update(context.TODO(), launcher, metav1.UpdateOptions{}); err != nil {
return err
}
Expand All @@ -597,14 +588,14 @@ func (c *MPIJobController) syncHandler(key string) error {

// cleanup the running worker pods if the MPI job is suspended
if isMPIJobSuspended(mpiJob) {
if err := cleanUpPods(mpiJob, c); err != nil {
if err := cleanUpWorkerPods(mpiJob, c); err != nil {
return err
}
}
return nil
}

func cleanUpPods(mpiJob *kubeflow.MPIJob, c *MPIJobController) error {
func cleanUpWorkerPods(mpiJob *kubeflow.MPIJob, c *MPIJobController) error {
// set worker StatefulSet Replicas to 0.
if err := c.deleteWorkerPods(mpiJob); err != nil {
return err
Expand Down
2 changes: 1 addition & 1 deletion sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit aaf12e6

Please sign in to comment.