Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix live-updates for kubernetes Job getting stuck occasionally #3685

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/controller/admissionchecks/multikueue/workload_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ func TestWlReconcile(t *testing.T) {
}

baseWorkloadBuilder := utiltesting.MakeWorkload("wl1", TestNamespace)
baseJobBuilder := testingjob.MakeJob("job1", TestNamespace)
baseJobBuilder := testingjob.MakeJob("job1", TestNamespace).Suspend(false)
baseJobManagedByKueueBuilder := baseJobBuilder.Clone().ManagedBy(kueue.MultiKueueControllerName)

cases := map[string]struct {
Expand Down
9 changes: 9 additions & 0 deletions pkg/controller/jobs/job/job_multikueue_adapter.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog/v2"
"k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
Expand All @@ -43,6 +44,8 @@ type multikueueAdapter struct{}
var _ jobframework.MultiKueueAdapter = (*multikueueAdapter)(nil)

func (b *multikueueAdapter) SyncJob(ctx context.Context, localClient client.Client, remoteClient client.Client, key types.NamespacedName, workloadName, origin string) error {
log := ctrl.LoggerFrom(ctx)

localJob := batchv1.Job{}
err := localClient.Get(ctx, key, &localJob)
if err != nil {
Expand All @@ -57,6 +60,12 @@ func (b *multikueueAdapter) SyncJob(ctx context.Context, localClient client.Clie

// the remote job exists
if err == nil {
if fromObject(&localJob).IsSuspended() {
// Ensure the job is unsuspended before updating its status; otherwise, it will fail when patching the spec.
log.V(2).Info("Skipping the sync since the local job is still suspended")
return nil
}

if features.Enabled(features.MultiKueueBatchJobWithManagedBy) {
return clientutil.PatchStatus(ctx, localClient, &localJob, func() (bool, error) {
localJob.Status = remoteJob.Status
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/jobs/job/job_multikueue_adapter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ func TestMultikueueAdapter(t *testing.T) {
cmpopts.EquateEmpty(),
}

baseJobBuilder := utiltestingjob.MakeJob("job1", TestNamespace)
baseJobBuilder := utiltestingjob.MakeJob("job1", TestNamespace).Suspend(false)
baseJobManagedByKueueBuilder := baseJobBuilder.Clone().ManagedBy(kueue.MultiKueueControllerName)

cases := map[string]struct {
Expand Down