From 07c5adb46fee335fe39f6b8dcfe38a9685cfe4d1 Mon Sep 17 00:00:00 2001 From: TommyLike Date: Fri, 26 Jul 2019 11:45:00 +0800 Subject: [PATCH] Fix race condition issue --- Makefile | 2 +- pkg/controllers/job/job_controller_actions.go | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4cf0fc8e913..1ac867a33e6 100644 --- a/Makefile +++ b/Makefile @@ -68,7 +68,7 @@ generate-code: ./hack/update-gencode.sh unit-test: - go list ./... | grep -v e2e | xargs go test -v -cover -covermode atomic -coverprofile coverage.txt + go list ./... | grep -v e2e | xargs go test -v -cover -covermode atomic -coverprofile coverage.txt -race e2e-test-kind: ./hack/run-e2e-kind.sh diff --git a/pkg/controllers/job/job_controller_actions.go b/pkg/controllers/job/job_controller_actions.go index 8d4b1569582..1b600c4044f 100644 --- a/pkg/controllers/job/job_controller_actions.go +++ b/pkg/controllers/job/job_controller_actions.go @@ -253,6 +253,7 @@ func (cc *Controller) syncJob(jobInfo *apis.JobInfo, updateStatus state.UpdateSt waitCreationGroup := sync.WaitGroup{} waitCreationGroup.Add(len(podToCreate)) + stateMutex := sync.Mutex{} for _, pod := range podToCreate { go func(pod *v1.Pod) { defer waitCreationGroup.Done() @@ -263,13 +264,17 @@ func (cc *Controller) syncJob(jobInfo *apis.JobInfo, updateStatus state.UpdateSt // So gang-scheduling could schedule the Job successfully glog.Errorf("Failed to create pod %s for Job %s, err %#v", pod.Name, job.Name, err) + stateMutex.Lock() creationErrs = append(creationErrs, fmt.Errorf("failed to create pod %s, err: %#v", pod.Name, err)) + stateMutex.Unlock() } else { if err != nil && apierrors.IsAlreadyExists(err) { cc.resyncTask(pod) } + stateMutex.Lock() classifyAndAddUpPodBaseOnPhase(newPod, &pending, &running, &succeeded, &failed, &unknown) + stateMutex.Unlock() glog.V(3).Infof("Created Task <%s> of Job <%s/%s>", pod.Name, job.Namespace, job.Name) } @@ -297,12 +302,16 @@ func (cc *Controller) syncJob(jobInfo *apis.JobInfo, updateStatus state.UpdateSt // So gang-scheduling could schedule the Job successfully glog.Errorf("Failed to delete pod %s for Job %s, err %#v", pod.Name, job.Name, err) + stateMutex.Lock() deletionErrs = append(deletionErrs, err) + stateMutex.Unlock() cc.resyncTask(pod) } else { glog.V(3).Infof("Deleted Task <%s> of Job <%s/%s>", pod.Name, job.Namespace, job.Name) + stateMutex.Lock() terminating++ + stateMutex.Unlock() } }(pod) }