diff --git a/crd/kubeflow.org_mpijobs.yaml b/crd/kubeflow.org_mpijobs.yaml index eb44e127..92f831df 100644 --- a/crd/kubeflow.org_mpijobs.yaml +++ b/crd/kubeflow.org_mpijobs.yaml @@ -7886,15 +7886,15 @@ spec: type: string suspend: default: false - description: "suspend specifies whether the Job controller should - create Pods or not. If a Job is created with suspend set to true, - no Pods are created by the Job controller. If a Job is suspended - after creation (i.e. the flag goes from false to true), the Job - controller will delete all active Pods associated with this Job. - Users must design their workload to gracefully handle this. Suspending - a Job will reset the StartTime field of the Job, effectively resetting - the ActiveDeadlineSeconds timer too. Defaults to false. \n Defaults - to false." + description: "suspend specifies whether the MPIJob controller should + create Pods or not. If an MPIJob is created with suspend set to true, + no Pods are created by the MPIJob controller. If an MPIJob is suspended + after creation (i.e. the flag goes from false to true), the MPIJob + controller will delete all active Pods associated with this MPIJob. + Also, it will suspend the Launcher Job. Users must design their + workload to gracefully handle this. Suspending an MPIJob will reset + the StartTime field of the MPIJob, effectively resetting the activeDeadlineSeconds + timer too. Defaults to false. \n Defaults to false." type: boolean required: - mpiReplicaSpecs diff --git a/pkg/apis/kubeflow/v2beta1/swagger.json b/pkg/apis/kubeflow/v2beta1/swagger.json index e5f2fb1f..b83b4447 100644 --- a/pkg/apis/kubeflow/v2beta1/swagger.json +++ b/pkg/apis/kubeflow/v2beta1/swagger.json @@ -272,7 +272,7 @@ "type": "string" }, "suspend": { - "description": "suspend specifies whether the Job controller should create Pods or not. If a Job is created with suspend set to true, no Pods are created by the Job controller. If a Job is suspended after creation (i.e. 
the flag goes from false to true), the Job controller will delete all active Pods associated with this Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the Job, effectively resetting the ActiveDeadlineSeconds timer too. Defaults to false.\n\nDefaults to false.", + "description": "suspend specifies whether the MPIJob controller should create Pods or not. If an MPIJob is created with suspend set to true, no Pods are created by the MPIJob controller. If an MPIJob is suspended after creation (i.e. the flag goes from false to true), the MPIJob controller will delete all active Pods associated with this MPIJob. Also, it will suspend the Launcher Job. Users must design their workload to gracefully handle this. Suspending an MPIJob will reset the StartTime field of the MPIJob, effectively resetting the activeDeadlineSeconds timer too. Defaults to false.\n\nDefaults to false.", "type": "boolean" } } diff --git a/pkg/apis/kubeflow/v2beta1/types.go b/pkg/apis/kubeflow/v2beta1/types.go index 89370905..2ab6b89f 100644 --- a/pkg/apis/kubeflow/v2beta1/types.go +++ b/pkg/apis/kubeflow/v2beta1/types.go @@ -63,13 +63,14 @@ type MPIJobSpec struct { // +kubebuilder:default:=OpenMPI MPIImplementation MPIImplementation `json:"mpiImplementation,omitempty"` - // suspend specifies whether the Job controller should create Pods or not. If - // a Job is created with suspend set to true, no Pods are created by the Job - // controller. If a Job is suspended after creation (i.e. the flag goes from - // false to true), the Job controller will delete all active Pods associated - // with this Job. Users must design their workload to gracefully handle this. - // Suspending a Job will reset the StartTime field of the Job, effectively - // resetting the ActiveDeadlineSeconds timer too. Defaults to false. + // suspend specifies whether the MPIJob controller should create Pods or not. 
+ // If an MPIJob is created with suspend set to true, no Pods are created by + // the MPIJob controller. If an MPIJob is suspended after creation (i.e. the + // flag goes from false to true), the MPIJob controller will delete all + // active Pods associated with this MPIJob. Also, it will suspend the + // Launcher Job. Users must design their workload to gracefully handle this. + // Suspending an MPIJob will reset the StartTime field of the MPIJob, effectively + // resetting the activeDeadlineSeconds timer too. Defaults to false. // // Defaults to false. // +kubebuilder:default:=false diff --git a/pkg/controller/mpi_job_controller.go b/pkg/controller/mpi_job_controller.go index 2f52e01e..7698d0d7 100644 --- a/pkg/controller/mpi_job_controller.go +++ b/pkg/controller/mpi_job_controller.go @@ -450,7 +450,6 @@ func (c *MPIJobController) processNextWorkItem() bool { // converge the two. It then updates the Status block of the MPIJob resource // with the current status of the resource. func (c *MPIJobController) syncHandler(key string) error { - klog.Infof("___ MYDEBUG starting for %s", key) startTime := time.Now() defer func() { klog.Infof("Finished syncing job %q (%v)", key, time.Since(startTime)) @@ -505,7 +504,7 @@ func (c *MPIJobController) syncHandler(key string) error { // cleanup and stop retrying the MPIJob. 
if isFinished(mpiJob.Status) && mpiJob.Status.CompletionTime != nil { if isCleanUpPods(mpiJob.Spec.RunPolicy.CleanPodPolicy) { - return cleanUpPods(mpiJob, c) + return cleanUpWorkerPods(mpiJob, c) } return nil } @@ -578,17 +577,9 @@ func (c *MPIJobController) syncHandler(key string) error { } if launcher != nil { - launcherSuspendUpdate := false - if isMPIJobSuspended(mpiJob) && !isJobSuspended(launcher) { - // suspend the launcher first if the MPI job is suspended - launcherSuspendUpdate = true - launcher.Spec.Suspend = pointer.Bool(true) - } else if !isMPIJobSuspended(mpiJob) && isJobSuspended(launcher) { - launcherSuspendUpdate = true - // unsuspend the launcher first if the MPI job is unsuspended - launcher.Spec.Suspend = pointer.Bool(false) - } - if launcherSuspendUpdate { + if isMPIJobSuspended(mpiJob) != isJobSuspended(launcher) { + // align the suspension state of launcher with the MPIJob + launcher.Spec.Suspend = pointer.Bool(isMPIJobSuspended(mpiJob)) if _, err := c.kubeClient.BatchV1().Jobs(namespace).Update(context.TODO(), launcher, metav1.UpdateOptions{}); err != nil { return err } @@ -597,14 +588,14 @@ func (c *MPIJobController) syncHandler(key string) error { // cleanup the running worker pods if the MPI job is suspended if isMPIJobSuspended(mpiJob) { - if err := cleanUpPods(mpiJob, c); err != nil { + if err := cleanUpWorkerPods(mpiJob, c); err != nil { return err } } return nil } -func cleanUpPods(mpiJob *kubeflow.MPIJob, c *MPIJobController) error { +func cleanUpWorkerPods(mpiJob *kubeflow.MPIJob, c *MPIJobController) error { // set worker StatefulSet Replicas to 0. 
if err := c.deleteWorkerPods(mpiJob); err != nil { return err diff --git a/sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md b/sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md index 64474a3a..5d305783 100644 --- a/sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md +++ b/sdk/python/v2beta1/docs/V2beta1MPIJobSpec.md @@ -9,7 +9,7 @@ Name | Type | Description | Notes **run_policy** | [**V1RunPolicy**](V1RunPolicy.md) | | [optional] **slots_per_worker** | **int** | Specifies the number of slots per worker used in hostfile. Defaults to 1. | [optional] **ssh_auth_mount_path** | **str** | SSHAuthMountPath is the directory where SSH keys are mounted. Defaults to \"/root/.ssh\". | [optional] -**suspend** | **bool** | suspend specifies whether the Job controller should create Pods or not. If a Job is created with suspend set to true, no Pods are created by the Job controller. If a Job is suspended after creation (i.e. the flag goes from false to true), the Job controller will delete all active Pods associated with this Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the Job, effectively resetting the ActiveDeadlineSeconds timer too. Defaults to false. Defaults to false. | [optional] +**suspend** | **bool** | suspend specifies whether the MPIJob controller should create Pods or not. If an MPIJob is created with suspend set to true, no Pods are created by the MPIJob controller. If an MPIJob is suspended after creation (i.e. the flag goes from false to true), the MPIJob controller will delete all active Pods associated with this MPIJob. Also, it will suspend the Launcher Job. Users must design their workload to gracefully handle this. Suspending an MPIJob will reset the StartTime field of the MPIJob, effectively resetting the activeDeadlineSeconds timer too. Defaults to false. Defaults to false. 
| [optional] [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) diff --git a/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py b/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py index a5f52181..b146cb96 100644 --- a/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py +++ b/sdk/python/v2beta1/mpijob/models/v2beta1_mpi_job_spec.py @@ -195,7 +195,7 @@ def ssh_auth_mount_path(self, ssh_auth_mount_path): def suspend(self): """Gets the suspend of this V2beta1MPIJobSpec. # noqa: E501 - suspend specifies whether the Job controller should create Pods or not. If a Job is created with suspend set to true, no Pods are created by the Job controller. If a Job is suspended after creation (i.e. the flag goes from false to true), the Job controller will delete all active Pods associated with this Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the Job, effectively resetting the ActiveDeadlineSeconds timer too. Defaults to false. Defaults to false. # noqa: E501 + suspend specifies whether the MPIJob controller should create Pods or not. If an MPIJob is created with suspend set to true, no Pods are created by the MPIJob controller. If an MPIJob is suspended after creation (i.e. the flag goes from false to true), the MPIJob controller will delete all active Pods associated with this MPIJob. Also, it will suspend the Launcher Job. Users must design their workload to gracefully handle this. Suspending an MPIJob will reset the StartTime field of the MPIJob, effectively resetting the activeDeadlineSeconds timer too. Defaults to false. Defaults to false. # noqa: E501 :return: The suspend of this V2beta1MPIJobSpec. # noqa: E501 :rtype: bool @@ -206,7 +206,7 @@ def suspend(self): def suspend(self, suspend): """Sets the suspend of this V2beta1MPIJobSpec. 
- suspend specifies whether the Job controller should create Pods or not. If a Job is created with suspend set to true, no Pods are created by the Job controller. If a Job is suspended after creation (i.e. the flag goes from false to true), the Job controller will delete all active Pods associated with this Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the Job, effectively resetting the ActiveDeadlineSeconds timer too. Defaults to false. Defaults to false. # noqa: E501 + suspend specifies whether the MPIJob controller should create Pods or not. If an MPIJob is created with suspend set to true, no Pods are created by the MPIJob controller. If an MPIJob is suspended after creation (i.e. the flag goes from false to true), the MPIJob controller will delete all active Pods associated with this MPIJob. Also, it will suspend the Launcher Job. Users must design their workload to gracefully handle this. Suspending an MPIJob will reset the StartTime field of the MPIJob, effectively resetting the activeDeadlineSeconds timer too. Defaults to false. Defaults to false. # noqa: E501 :param suspend: The suspend of this V2beta1MPIJobSpec. # noqa: E501 :type suspend: bool