[RayService] Revisit the conditions under which a RayService is considered unhealthy and the default threshold #1293

Merged

Changes from 3 commits
5 changes: 5 additions & 0 deletions ray-operator/apis/ray/v1alpha1/rayservice_types.go
@@ -23,16 +23,21 @@ const (
)

// These statuses should match Ray Serve's application statuses
// See `enum ApplicationStatus` in https://sourcegraph.com/github.com/ray-project/ray/-/blob/src/ray/protobuf/serve.proto for more details.
var ApplicationStatusEnum = struct {
NOT_STARTED string
DEPLOYING string
RUNNING string
DEPLOY_FAILED string
DELETING string
UNHEALTHY string
}{
NOT_STARTED: "NOT_STARTED",
DEPLOYING: "DEPLOYING",
RUNNING: "RUNNING",
DEPLOY_FAILED: "DEPLOY_FAILED",
DELETING: "DELETING",
UNHEALTHY: "UNHEALTHY",
}

// These statuses should match Ray Serve's deployment statuses
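
For orientation, the check these new enum values feed into can be sketched as a tiny predicate. This mirrors the `isServeAppUnhealthyOrDeployedFailed` helper added at the bottom of `rayservice_controller.go` later in this diff; the function name below is illustrative and assumes it lives in the same package as the enum above.

```go
// A Serve application is treated as terminally bad when Ray reports it as
// UNHEALTHY or DEPLOY_FAILED; ApplicationStatusEnum is the struct defined above.
func isAppUnhealthyOrDeployFailed(appStatus string) bool {
	return appStatus == ApplicationStatusEnum.UNHEALTHY ||
		appStatus == ApplicationStatusEnum.DEPLOY_FAILED
}
```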
4 changes: 2 additions & 2 deletions ray-operator/config/samples/ray-service.autoscaler.yaml
@@ -7,8 +7,8 @@ kind: RayService
metadata:
name: rayservice-sample
spec:
serviceUnhealthySecondThreshold: 300 # Config for the health check threshold for service. Default value is 60.
deploymentUnhealthySecondThreshold: 300 # Config for the health check threshold for deployments. Default value is 60.
serviceUnhealthySecondThreshold: 900 # Config for the health check threshold for Ray Serve applications. Default value is 900.
deploymentUnhealthySecondThreshold: 300 # Config for the health check threshold for Ray dashboard agent. Default value is 300.
# The workload consists of two applications. The first application checks on an event in the second application.
# If the event isn't set, the first application will block on requests until the event is set. So, to test upscaling
# we can first send a bunch of requests to the first application, which will trigger Serve autoscaling to bring up

@@ -7,8 +7,8 @@ kind: RayService
metadata:
name: rayservice-sample
spec:
serviceUnhealthySecondThreshold: 300 # Config for the health check threshold for service. Default value is 60.
deploymentUnhealthySecondThreshold: 300 # Config for the health check threshold for deployments. Default value is 60.
serviceUnhealthySecondThreshold: 900 # Config for the health check threshold for Ray Serve applications. Default value is 900.
deploymentUnhealthySecondThreshold: 300 # Config for the health check threshold for Ray dashboard agent. Default value is 300.
serveService:
metadata:
name: custom-ray-serve-service-name
4 changes: 2 additions & 2 deletions ray-operator/config/samples/ray-service.different-port.yaml
@@ -7,8 +7,8 @@ kind: RayService
metadata:
name: rayservice-sample
spec:
serviceUnhealthySecondThreshold: 300 # Config for the health check threshold for service. Default value is 60.
deploymentUnhealthySecondThreshold: 300 # Config for the health check threshold for deployments. Default value is 60.
serviceUnhealthySecondThreshold: 900 # Config for the health check threshold for Ray Serve applications. Default value is 900.
deploymentUnhealthySecondThreshold: 300 # Config for the health check threshold for Ray dashboard agent. Default value is 300.
serveConfig:
importPath: fruit.deployment_graph
runtimeEnv: |
4 changes: 2 additions & 2 deletions ray-operator/config/samples/ray-service.mobilenet.yaml
@@ -3,8 +3,8 @@ kind: RayService
metadata:
name: rayservice-mobilenet
spec:
serviceUnhealthySecondThreshold: 300 # Config for the health check threshold for service. Default value is 60.
deploymentUnhealthySecondThreshold: 300 # Config for the health check threshold for deployments. Default value is 60.
serviceUnhealthySecondThreshold: 900 # Config for the health check threshold for Ray Serve applications. Default value is 900.
deploymentUnhealthySecondThreshold: 300 # Config for the health check threshold for Ray dashboard agent. Default value is 300.
serveConfigV2: |
applications:
- name: mobilenet
4 changes: 2 additions & 2 deletions ray-operator/config/samples/ray-service.stable-diffusion.yaml
@@ -3,8 +3,8 @@ kind: RayService
metadata:
name: stable-diffusion
spec:
serviceUnhealthySecondThreshold: 300 # Config for the health check threshold for service. Default value is 60.
deploymentUnhealthySecondThreshold: 300 # Config for the health check threshold for deployments. Default value is 60.
serviceUnhealthySecondThreshold: 900 # Config for the health check threshold for Ray Serve applications. Default value is 900.
deploymentUnhealthySecondThreshold: 300 # Config for the health check threshold for Ray dashboard agent. Default value is 300.
serveConfigV2: |
applications:
- name: stable_diffusion
4 changes: 2 additions & 2 deletions ray-operator/config/samples/ray_v1alpha1_rayservice.yaml
@@ -7,8 +7,8 @@ kind: RayService
metadata:
name: rayservice-sample
spec:
serviceUnhealthySecondThreshold: 300 # Config for the health check threshold for service. Default value is 60.
deploymentUnhealthySecondThreshold: 300 # Config for the health check threshold for deployments. Default value is 60.
serviceUnhealthySecondThreshold: 900 # Config for the health check threshold for Ray Serve applications. Default value is 900.
deploymentUnhealthySecondThreshold: 300 # Config for the health check threshold for Ray dashboard agent. Default value is 300.
# serveConfigV2 takes a yaml multi-line scalar, which should be a Ray Serve multi-application config. See https://docs.ray.io/en/latest/serve/multi-app.html.
# Only one of serveConfig and serveConfigV2 should be used.
serveConfigV2: |
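
All of the sample manifests above tune the same two spec fields. To map the YAML keys back to the Go API, here is a rough, illustrative fragment; the `*int32` pointer types are an assumption inferred from the controller signature further down in this diff, not copied from the real `rayservice_types.go`.

```go
// Illustrative fragment only; field names come from the YAML samples, types are assumed.
type rayServiceThresholds struct {
	// How long (seconds) a Serve application may stay UNHEALTHY or DEPLOY_FAILED
	// before KubeRay prepares a new RayCluster. Default: 900.
	ServiceUnhealthySecondThreshold *int32 `json:"serviceUnhealthySecondThreshold,omitempty"`
	// How long (seconds) the dashboard agent may stay unreachable before KubeRay
	// prepares a new RayCluster. Default: 300.
	DeploymentUnhealthySecondThreshold *int32 `json:"deploymentUnhealthySecondThreshold,omitempty"`
}
```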
52 changes: 30 additions & 22 deletions ray-operator/controllers/ray/rayservice_controller.go
@@ -37,14 +37,14 @@ import (

// This variable is mutable for unit testing purpose.
var (
ServiceUnhealthySecondThreshold = 60.0 // Serve deployment related health check.
ServiceUnhealthySecondThreshold = 900.0 // Serve deployment related health check.
)

const (
ServiceDefaultRequeueDuration = 2 * time.Second
ServiceRestartRequeueDuration = 10 * time.Second
RayClusterDeletionDelayDuration = 60 * time.Second
DeploymentUnhealthySecondThreshold = 60.0 // Dashboard agent related health check.
DeploymentUnhealthySecondThreshold = 300.0 // Dashboard agent related health check.
)

// RayServiceReconciler reconciles a RayService object
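
Because `ServiceUnhealthySecondThreshold` is intentionally a mutable package-level variable, a unit test can shrink it rather than wait out the new 900-second default. A minimal sketch, assuming it sits in the controller package with the standard `testing` import:

```go
func TestWithShortUnhealthyThreshold(t *testing.T) {
	orig := ServiceUnhealthySecondThreshold
	ServiceUnhealthySecondThreshold = 1.0 // seconds; avoids waiting out the 900s default
	defer func() { ServiceUnhealthySecondThreshold = orig }()

	// ... exercise getAndCheckServeStatus with a stubbed dashboard client here ...
}
```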
@@ -730,9 +730,12 @@ func (r *RayServiceReconciler) updateServeDeployment(ctx context.Context, raySer
// `getAndCheckServeStatus` gets Serve applications' and deployments' statuses, updates health timestamps,
// and checks if the RayCluster is overall healthy. It takes as one of its inputs `serveConfigType`, which
// is used to decide whether to query the single-application Serve REST API or the multi-application Serve
// REST API. Its return values should be interpreted as:
// REST API. Its return values should be interpreted as: (isHealthy, isReady, err).
//
// (Serve app healthy?, Serve app ready?, error if failed to get Serve statuses)
// (1) `isHealthy` is used to determine whether to restart the RayCluster or not.
// (2) `isReady` is used to determine whether the Serve applications in the RayCluster are ready to serve incoming traffic or not.
// (3) `err`: If `err` is not nil, it means that KubeRay failed to get Serve application statuses from the dashboard agent. We should take a
//     look at the dashboard agent rather than the Ray Serve applications.
func (r *RayServiceReconciler) getAndCheckServeStatus(ctx context.Context, dashboardClient utils.RayDashboardClientInterface, rayServiceServeStatus *rayv1alpha1.RayServiceStatus, serveConfigType utils.RayServeConfigType, unhealthySecondThreshold *int32) (bool, bool, error) {
// If the `unhealthySecondThreshold` value is non-nil, then we will use that value. Otherwise, we will use the value ServiceUnhealthySecondThreshold
// which can be set in a test. This is used for testing purposes.
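
The fallback described in the comment above amounts to a nil check; a condensed sketch (editorial, not the exact PR code):

```go
// Prefer the per-RayService spec value; otherwise fall back to the package-level
// default (900 seconds), which unit tests are allowed to override.
serviceUnhealthySecondThreshold := ServiceUnhealthySecondThreshold
if unhealthySecondThreshold != nil {
	serviceUnhealthySecondThreshold = float64(*unhealthySecondThreshold)
}
```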
@@ -789,26 +792,32 @@ func (r *RayServiceReconciler) getAndCheckServeStatus(ctx context.Context, dashb
Deployments: make(map[string]rayv1alpha1.ServeDeploymentStatus),
}

// Check app status
if app.Status != rayv1alpha1.ApplicationStatusEnum.RUNNING {
// Check previous app status
if prevApplicationStatus.Status != rayv1alpha1.ApplicationStatusEnum.RUNNING {
// `isHealthy` is used to determine whether to restart the RayCluster or not. If the serve application is `UNHEALTHY` or `DEPLOY_FAILED`
// for more than `serviceUnhealthySecondThreshold` seconds, then KubeRay will consider the RayCluster unhealthy and prepare a new RayCluster.
if isServeAppUnhealthyOrDeployedFailed(app.Status) {
if isServeAppUnhealthyOrDeployedFailed(prevApplicationStatus.Status) {
if prevApplicationStatus.HealthLastUpdateTime != nil {
applicationStatus.HealthLastUpdateTime = prevApplicationStatus.HealthLastUpdateTime
if time.Since(prevApplicationStatus.HealthLastUpdateTime.Time).Seconds() > serviceUnhealthySecondThreshold {
r.Log.Info("Restart RayCluster", "appName", appName, "restart reason",
fmt.Sprintf(
"The status of the serve application %s has not been RUNNING for more than %f seconds. "+
"The status of the serve application %s has been UNHEALTHY or DEPLOY_FAILED for more than %f seconds. "+
"Hence, KubeRay operator labels the RayCluster unhealthy and will prepare a new RayCluster. ",
appName, serviceUnhealthySecondThreshold))
isHealthy = false
}
}
}
}

// `isReady` is used to determine whether the Serve application is ready or not. The cluster switchover only happens when all Serve
// applications in this RayCluster are ready so that the incoming traffic will not be dropped. Note that if `isHealthy` is false,
// then `isReady` must be false as well.
if app.Status != rayv1alpha1.ApplicationStatusEnum.RUNNING {
isReady = false
}

// Check deployment statuses
// Copy deployment statuses
for deploymentName, deployment := range app.Deployments {
deploymentStatus := rayv1alpha1.ServeDeploymentStatus{
Status: deployment.Status,
@@ -817,26 +826,21 @@ func (r *RayServiceReconciler) getAndCheckServeStatus(ctx context.Context, dashb
HealthLastUpdateTime: &timeNow,
}

if deployment.Status != rayv1alpha1.DeploymentStatusEnum.HEALTHY {
if deployment.Status == rayv1alpha1.DeploymentStatusEnum.UNHEALTHY {
prevStatus, exist := prevApplicationStatus.Deployments[deploymentName]
if exist {
if prevStatus.Status != rayv1alpha1.DeploymentStatusEnum.HEALTHY {
if prevStatus.Status == rayv1alpha1.DeploymentStatusEnum.UNHEALTHY {
deploymentStatus.HealthLastUpdateTime = prevStatus.HealthLastUpdateTime

if !isHealthy || (prevStatus.HealthLastUpdateTime != nil && time.Since(prevStatus.HealthLastUpdateTime.Time).Seconds() > serviceUnhealthySecondThreshold) {
// TODO (kevin85421): Without `!isHealthy`, this `if` statement is almost impossible to be reached because the `HealthLastUpdateTime` of a serve deployment
// is always later than the `HealthLastUpdateTime` of the serve application. Hence, the restart is always triggered by the serve application. If we
// can confirm that `isHealthy = false` is always set by the serve application check, we can remove the `time.Since` check here.
if !isHealthy {
r.Log.Info("Restart RayCluster", "deploymentName", deploymentName, "appName", appName, "restart reason",
fmt.Sprintf(
"The status of the serve deployment %s or the serve application %s has not been HEALTHY/RUNNING for more than %f seconds. "+
"The serve application %s has been UNHEALTHY or DEPLOY_FAILED for more than %f seconds. "+
"This may be caused by the serve deployment %s is UNHEALTHY. "+
"Hence, KubeRay operator labels the RayCluster unhealthy and will prepare a new RayCluster. "+
"The message of the serve deployment is: %s", deploymentName, appName, serviceUnhealthySecondThreshold, deploymentStatus.Message))
isHealthy = false
"The message of the serve deployment is: %s", appName, serviceUnhealthySecondThreshold, deploymentName, deploymentStatus.Message))
}
}
}
isReady = false
}
applicationStatus.Deployments[deploymentName] = deploymentStatus
}
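
Stripping away the status-copying bookkeeping, the loop above makes two independent per-application decisions; the condensed sketch below is editorial and not part of the diff.

```go
// Restart decision: the application has sat in UNHEALTHY/DEPLOY_FAILED past the threshold.
if isServeAppUnhealthyOrDeployedFailed(prevApplicationStatus.Status) &&
	prevApplicationStatus.HealthLastUpdateTime != nil &&
	time.Since(prevApplicationStatus.HealthLastUpdateTime.Time).Seconds() > serviceUnhealthySecondThreshold {
	isHealthy = false // KubeRay will prepare a new RayCluster
}

// Readiness decision: anything other than RUNNING blocks the traffic switchover.
if app.Status != rayv1alpha1.ApplicationStatusEnum.RUNNING {
	isReady = false
}
```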
@@ -1090,7 +1094,7 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns
}

var isHealthy, isReady bool
if isHealthy, isReady, err = r.getAndCheckServeStatus(ctx, rayDashboardClient, rayServiceStatus, r.determineServeConfigType(rayServiceInstance), rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold); err != nil {
if isHealthy, isReady, err = r.getAndCheckServeStatus(ctx, rayDashboardClient, rayServiceStatus, r.determineServeConfigType(rayServiceInstance), rayServiceInstance.Spec.ServiceUnhealthySecondThreshold); err != nil {
if !r.updateAndCheckDashboardStatus(rayServiceStatus, false, rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold) {
logger.Info("Dashboard is unhealthy, restart the cluster.")
r.markRestart(rayServiceInstance)
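
Reading the three return values together, the caller's handling can be summarized roughly as follows; this is a simplified, editorial reading of `reconcileServe`, not its actual code.

```go
isHealthy, isReady, err := r.getAndCheckServeStatus(ctx, rayDashboardClient, rayServiceStatus,
	r.determineServeConfigType(rayServiceInstance), rayServiceInstance.Spec.ServiceUnhealthySecondThreshold)
switch {
case err != nil:
	// The dashboard agent could not be queried; it is judged against
	// deploymentUnhealthySecondThreshold and the cluster is restarted if it
	// has been unreachable for too long.
case !isHealthy:
	// A Serve application exceeded serviceUnhealthySecondThreshold while
	// UNHEALTHY/DEPLOY_FAILED; mark the cluster for restart.
case !isReady:
	// Applications are not all RUNNING yet; keep traffic on the current cluster.
default:
	// Healthy and ready: the Serve service can point at this RayCluster.
}
```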
@@ -1214,3 +1218,7 @@ func (r *RayServiceReconciler) isHeadPodRunningAndReady(ctx context.Context, ins

return utils.IsRunningAndReady(&podList.Items[0]), nil
}

func isServeAppUnhealthyOrDeployedFailed(appStatus string) bool {
return appStatus == rayv1alpha1.ApplicationStatusEnum.UNHEALTHY || appStatus == rayv1alpha1.ApplicationStatusEnum.DEPLOY_FAILED
}