Skip to content

Commit

Permalink
[RayService] Revisit the conditions under which a RayService is consi…
Browse files Browse the repository at this point in the history
…dered unhealthy and the default threshold (ray-project#1293)

Revisit the conditions under which a RayService is considered unhealthy and the default threshold
  • Loading branch information
kevin85421 authored and blublinsky committed Aug 29, 2023
1 parent 10dc57e commit 029dd7e
Showing 1 changed file with 5 additions and 15 deletions.
20 changes: 5 additions & 15 deletions ray-operator/controllers/ray/rayservice_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -734,8 +734,8 @@ func (r *RayServiceReconciler) updateServeDeployment(ctx context.Context, raySer
//
// (1) `isHealthy` is used to determine whether to restart the RayCluster or not.
// (2) `isReady` is used to determine whether the Serve applications in the RayCluster are ready to serve incoming traffic or not.
// (3) `err`: If `err` is not nil, it means that KubeRay failed to get Serve application statuses from the dashboard agent. We should take a look at dashboard agent rather than Ray Serve applications.

// (3) `err`: If `err` is not nil, it means that KubeRay failed to get Serve application statuses from the dashboard agent. We should take a
// look at the dashboard agent rather than the Ray Serve applications.
func (r *RayServiceReconciler) getAndCheckServeStatus(ctx context.Context, dashboardClient utils.RayDashboardClientInterface, rayServiceServeStatus *rayv1alpha1.RayServiceStatus, serveConfigType utils.RayServeConfigType, unhealthySecondThreshold *int32) (bool, bool, error) {
// If the `unhealthySecondThreshold` value is non-nil, then we will use that value. Otherwise, we will use the value ServiceUnhealthySecondThreshold
// which can be set in a test. This is used for testing purposes.
Expand Down Expand Up @@ -1162,24 +1162,14 @@ func (r *RayServiceReconciler) labelHealthyServePods(ctx context.Context, rayClu
if pod.Labels == nil {
pod.Labels = make(map[string]string)
}

// Make a copy of the labels for comparison later, to decide whether we need to push an update.
originalLabels := make(map[string]string, len(pod.Labels))
for key, value := range pod.Labels {
originalLabels[key] = value
}

if httpProxyClient.CheckHealth() == nil {
pod.Labels[common.RayClusterServingServiceLabelKey] = common.EnableRayClusterServingServiceTrue
} else {
pod.Labels[common.RayClusterServingServiceLabelKey] = common.EnableRayClusterServingServiceFalse
}

if !reflect.DeepEqual(originalLabels, pod.Labels) {
if updateErr := r.Update(ctx, &pod); updateErr != nil {
r.Log.Error(updateErr, "Pod label Update error!", "Pod.Error", updateErr)
return updateErr
}
if updateErr := r.Update(ctx, &pod); updateErr != nil {
r.Log.Error(updateErr, "Pod label Update error!", "Pod.Error", updateErr)
return updateErr
}
}

Expand Down

0 comments on commit 029dd7e

Please sign in to comment.