Skip to content

Commit

Permalink
Add the support to replace evicted head pod (#381)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeffwan authored Jul 15, 2022
1 parent 022eed1 commit 9de2fb0
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 1 deletion.
6 changes: 6 additions & 0 deletions ray-operator/controllers/ray/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,12 @@ func (r *RayClusterReconciler) reconcilePods(instance *rayiov1alpha1.RayCluster)
log.Info("reconcilePods ", "head pod found", headPod.Name)
if headPod.Status.Phase == corev1.PodRunning || headPod.Status.Phase == corev1.PodPending {
log.Info("reconcilePods", "head pod is up and running... checking workers", headPod.Name)
} else if headPod.Status.Phase == corev1.PodFailed && strings.Contains(headPod.Status.Reason, "Evicted") {
// Handle evicted pod
log.Info("reconcilePods", "head pod has been evicted and controller needs to replace the pod", headPod.Name)
if err := r.Delete(context.TODO(), &headPod); err != nil {
return err
}
} else {
return fmt.Errorf("head pod %s is not running nor pending", headPod.Name)
}
Expand Down
48 changes: 47 additions & 1 deletion ray-operator/controllers/ray/raycluster_controller_fake_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ var (
expectReplicaNum int32
testPods []runtime.Object
testRayCluster *rayiov1alpha1.RayCluster
headSelector labels.Selector
workerSelector labels.Selector
workersToDelete []string
)
Expand Down Expand Up @@ -309,7 +310,13 @@ func setupTest(t *testing.T) {
selection.Equals,
groupNameReqValue)
assert.Nil(t, err, "Fail to create requirement")

headNameReqValue := []string{headGroupNameStr}
headNameReq, err := labels.NewRequirement(
common.RayNodeGroupLabelKey,
selection.Equals,
headNameReqValue)
assert.Nil(t, err, "Fail to create requirement")
headSelector = labels.NewSelector().Add(*headNameReq)
workerSelector = labels.NewSelector().Add(*instanceReq).Add(*groupNameReq)
}

Expand Down Expand Up @@ -572,6 +579,45 @@ func TestReconcile_PodDCrash_DiffLess0_OK(t *testing.T) {
}
}

// TestReconcile_PodEvicted_DiffLess0_OK marks a pod as Failed with reason
// "Evicted", runs reconcilePods, and then asserts that no pod matching
// headSelector is left in the namespace.
func TestReconcile_PodEvicted_DiffLess0_OK(t *testing.T) {
	setupTest(t)
	defer tearDown(t)

	ctx := context.Background()
	fakeClient := clientFake.NewClientBuilder().WithRuntimeObjects(testPods...).Build()

	// Sanity check: every fixture pod must be visible through the fake client.
	podList := corev1.PodList{}
	err := fakeClient.List(ctx, &podList, client.InNamespace(namespaceStr))
	assert.Nil(t, err, "Fail to get pod list")
	assert.Equal(t, len(testPods), len(podList.Items), "Init pod list len is wrong")

	// Simulate an eviction of the head pod.
	// NOTE(review): this assumes the first listed pod is the head pod — confirm
	// against the testPods fixture and the fake client's list ordering.
	evicted := &podList.Items[0]
	evicted.Status.Phase = corev1.PodFailed
	evicted.Status.Reason = "Evicted"
	err = fakeClient.Update(ctx, evicted)
	assert.Nil(t, err, "Fail to get update pod status")

	reconciler := &RayClusterReconciler{
		Client:   fakeClient,
		Recorder: &record.FakeRecorder{},
		Scheme:   scheme.Scheme,
		Log:      ctrl.Log.WithName("controllers").WithName("RayCluster"),
	}

	err = reconciler.reconcilePods(testRayCluster)
	assert.Nil(t, err, "Fail to reconcile Pods")

	// List again, this time restricted to pods selected by headSelector.
	headOnly := &client.ListOptions{
		LabelSelector: headSelector,
		Namespace:     namespaceStr,
	}
	err = fakeClient.List(ctx, &podList, headOnly)
	assert.Nil(t, err, "Fail to get pod list after reconcile")

	remaining := len(podList.Items)
	assert.Equal(t, 0, remaining,
		"Evicted head should be deleted after reconcile expect %d actual %d", 0, remaining)
}

func TestReconcile_UpdateLocalWorkersToDelete_OK(t *testing.T) {
setupTest(t)
defer tearDown(t)
Expand Down

0 comments on commit 9de2fb0

Please sign in to comment.