Dmitri/pick helm cleanup #815

Closed
21 changes: 12 additions & 9 deletions .github/workflows/test-job.yaml
@@ -4,8 +4,11 @@ on:
push:
branches:
- master
- release-0.3
pull_request:
branches: [ master ]
branches:
- master
- release-0.3

jobs:
lint:
@@ -167,16 +170,16 @@ jobs:
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
if: ${{ github.ref == 'refs/heads/master' }}
if: contains(fromJson('["refs/heads/master", "refs/heads/release-0.3"]'), github.ref)

- name: Push Apiserver to DockerHub
run: |
docker push kuberay/apiserver:${{ steps.vars.outputs.sha_short }};
docker image tag kuberay/apiserver:${{ steps.vars.outputs.sha_short }} kuberay/apiserver:nightly;
docker push kuberay/apiserver:nightly
docker image tag kuberay/apiserver:${{ steps.vars.outputs.sha_short }} kuberay/apiserver:release-0.3;
docker push kuberay/apiserver:release-0.3

working-directory: ${{env.working-directory}}
if: ${{ github.ref == 'refs/heads/master' }}
if: contains(fromJson('["refs/heads/master", "refs/heads/release-0.3"]'), github.ref)

- name: Build CLI
run: go build -o kuberay -a main.go
@@ -249,16 +252,16 @@ jobs:
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
if: ${{ github.ref == 'refs/heads/master' }}
if: contains(fromJson('["refs/heads/master", "refs/heads/release-0.3"]'), github.ref)

- name: Push Operator to DockerHub
run: |
docker push kuberay/operator:${{ steps.vars.outputs.sha_short }};
docker image tag kuberay/operator:${{ steps.vars.outputs.sha_short }} kuberay/operator:nightly;
docker push kuberay/operator:nightly
docker image tag kuberay/operator:${{ steps.vars.outputs.sha_short }} kuberay/operator:release-0.3;
docker push kuberay/operator:release-0.3

working-directory: ${{env.working-directory}}
if: ${{ github.ref == 'refs/heads/master' }}
if: contains(fromJson('["refs/heads/master", "refs/heads/release-0.3"]'), github.ref)

test-compatibility-1_11_0:
needs:
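For reference, a minimal, hypothetical workflow snippet showing the branch-gating pattern introduced above; only the `if:` expression mirrors the actual change, the job and step names are illustrative.

```yaml
# Illustrative sketch, not part of this PR: only the `if:` expression matches the workflow change.
name: example-branch-gated-push
on:
  push:
    branches: [ master, release-0.3 ]
jobs:
  push-images:
    runs-on: ubuntu-latest
    steps:
      - name: Push image to DockerHub
        # Runs on master and release-0.3; a future release branch just needs another list entry.
        if: contains(fromJson('["refs/heads/master", "refs/heads/release-0.3"]'), github.ref)
        run: echo "pushing images for ${{ github.ref }}"
```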
5 changes: 5 additions & 0 deletions apiserver/deploy/base/kustomization.yaml
@@ -9,3 +9,8 @@ resources:
commonLabels:
app.kubernetes.io/name: kuberay
app.kubernetes.io/component: kuberay-apiserver

images:
- name: kuberay/apiserver
newName: kuberay/apiserver
newTag: v0.3.0
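A quick, hypothetical way to confirm what the new `images:` block does, assuming the base path shown above:

```shell
# Render the apiserver base and check that kustomize pins the image to v0.3.0.
kubectl kustomize apiserver/deploy/base | grep "kuberay/apiserver"
# expected: image: kuberay/apiserver:v0.3.0
```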
4 changes: 2 additions & 2 deletions docs/best-practice/worker-head-reconnection.md
@@ -22,12 +22,12 @@ It retries 600 times and each interval is 1s, resulting in total 600s timeout, i

## Best Practice

GCS HA feature [#20498](https://github.com/ray-project/ray/issues/20498) is planned in Ray Core Roadmap. When this feature is released, expect a stable head and GCS such that worker-head connection lost issue will not appear anymore.
GCS FT feature [#20498](https://github.com/ray-project/ray/issues/20498) is planned in Ray Core Roadmap. When this feature is released, expect a stable head and GCS such that worker-head connection lost issue will not appear anymore.

Before that, to solve the workers-head connection lost, there are two options:

- Make head more stable: when creating the cluster, allocate sufficient amount of resources on head pod such that it tends to be stable and not easy to crash. You can also set {"num-cpus": "0"} in "rayStartParams" of "headGroupSpec" such that Ray scheduler will skip the head node when scheduling workloads. This also helps to maintain the stability of the head.

- Make reconnection shorter: for version <= 1.9.1, you can set this head param --system-config='{"ping_gcs_rpc_server_max_retries": 20}' to reduce the delay from 600s down to 20s before workers reconnect to the new head.

> Note: we should update this doc when GCS HA feature gets updated.
> Note: we should update this doc when GCS FT feature gets updated.
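A sketch combining the two mitigations above, assuming a standard RayCluster `headGroupSpec`; the cluster name is a placeholder and the `system-config` key is assumed to map to the `--system-config` flag:

```yaml
# Illustrative only: stabilize the head and shorten worker reconnection (Ray <= 1.9.1).
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  name: raycluster-example        # placeholder name
spec:
  headGroupSpec:
    rayStartParams:
      num-cpus: "0"               # keep workloads off the head node
      # Assumed mapping to --system-config; reduces reconnection delay from 600s to ~20s.
      system-config: '{"ping_gcs_rpc_server_max_retries": 20}'
```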
30 changes: 15 additions & 15 deletions docs/guidance/gcs-ha.md → docs/guidance/gcs-ft.md
@@ -1,32 +1,32 @@
## Ray GCS HA (Experimental)
## Ray GCS Fault Tolerance(GCS FT) (Experimental)

> Note: This feature is still experimental, there are a few limitations and stabilization will be done in future release from both Ray and KubeRay side.

Ray GCS HA enables GCS server to use external storage backend. As a result, Ray clusters can tolerant GCS failures and recover from failures
Ray GCS FT enables GCS server to use external storage backend. As a result, Ray clusters can tolerant GCS failures and recover from failures
without affecting important services such as detached Actors & RayServe deployments.

### Prerequisite

* Ray 2.0 is required.
* You need to support external Redis server for Ray. (Redis HA cluster is highly recommended.)

### Enable Ray GCS HA
### Enable Ray GCS FT

To enable Ray GCS HA in your newly KubeRay-managed Ray cluster, you need to enable it by adding an annotation to the
To enable Ray GCS FT in your newly KubeRay-managed Ray cluster, you need to enable it by adding an annotation to the
RayCluster YAML file.

```yaml
...
kind: RayCluster
metadata:
annotations:
ray.io/ha-enabled: "true" # <- add this annotation enable GCS HA
ray.io/ft-enabled: "true" # <- add this annotation enable GCS FT
ray.io/external-storage-namespace: "my-raycluster-storage-namespace" # <- optional, to specify the external storage namespace
...
```
An example can be found at [ray-cluster.external-redis.yaml](https://github.com/ray-project/kuberay/blob/master/ray-operator/config/samples/ray-cluster.external-redis.yaml)

When annotation `ray.io/ha-enabled` is added with a `true` value, KubeRay will enable Ray GCS HA feature. This feature
When annotation `ray.io/ft-enabled` is added with a `true` value, KubeRay will enable Ray GCS FT feature. This feature
contains several components:

1. Newly created Ray cluster has `Readiness Probe` and `Liveness Probe` added to all the head/worker nodes.
@@ -37,7 +37,7 @@ contains several components:

#### Readiness Probe vs Liveness Probe

These are the two types of probes we used in Ray GCS HA.
These are the two types of probes we used in Ray GCS FT.

The readiness probe is used to notify KubeRay in case of failures in the corresponding Ray cluster. KubeRay can try its best to
recover the Ray cluster. If KubeRay cannot recover the failed head/worker node, the liveness probe gets in, delete the old pod
@@ -53,14 +53,14 @@ On Ray head node, we access a local Ray dashboard http endpoint and a Raylet htt
healthy state. Since Ray dashboard does not reside Ray worker node, we only check the local Raylet http endpoint to make sure
the worker node is healthy.

#### Ray GCS HA Annotation
#### Ray GCS FT Annotation

Our Ray GCS HA feature checks if an annotation called `ray.io/ha-enabled` is set to `true` in `RayCluster` YAML file. If so, KubeRay
Our Ray GCS FT feature checks if an annotation called `ray.io/ft-enabled` is set to `true` in `RayCluster` YAML file. If so, KubeRay
will also add such annotation to the pod whenever the head/worker node is created.

#### Use External Redis Cluster

To use external Redis cluster as the backend storage(required by Ray GCS HA),
To use external Redis cluster as the backend storage(required by Ray GCS FT),
you need to add `RAY_REDIS_ADDRESS` environment variable to the head node template.

Also, you can specify a storage namespace for your Ray cluster by using an annotation `ray.io/external-storage-namespace`
@@ -70,8 +70,8 @@ An example can be found at [ray-cluster.external-redis.yaml](https://github.com/
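A minimal sketch of both settings, with placeholder values (see the linked ray-cluster.external-redis.yaml sample for the complete manifest):

```yaml
# Sketch only: the Redis address and storage namespace are placeholders.
kind: RayCluster
metadata:
  annotations:
    ray.io/ft-enabled: "true"
    ray.io/external-storage-namespace: "my-raycluster-storage-namespace"   # optional
spec:
  headGroupSpec:
    template:
      spec:
        containers:
          - name: ray-head
            env:
              - name: RAY_REDIS_ADDRESS
                value: "redis:6379"    # external Redis (an HA Redis cluster is recommended)
```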
#### KubeRay Operator Controller

KubeRay Operator controller watches for new `Event` reconcile call. If this Event object is to notify the failed readiness probe,
controller checks if this pod has `ray.io/ha-enabled` set to `true`. If this pod has this annotation set to true, that means this pod
belongs to a Ray cluster that has Ray GCS HA enabled.
controller checks if this pod has `ray.io/ft-enabled` set to `true`. If this pod has this annotation set to true, that means this pod
belongs to a Ray cluster that has Ray GCS FT enabled.

After this, the controller will try to recover the failed pod. If controller cannot recover it, an annotation named
`ray.io/health-state` with a value `Unhealthy` is added to this pod.
@@ -82,7 +82,7 @@ In every KubeRay Operator controller reconcile loop, it monitors any pod in Ray
#### External Storage Namespace

External storage namespaces can be used to share a single storage backend among multiple Ray clusters. By default, `ray.io/external-storage-namespace`
uses the RayCluster UID as its value when GCS HA is enabled. Or if the user wants to use customized external storage namespace,
uses the RayCluster UID as its value when GCS FT is enabled. Or if the user wants to use customized external storage namespace,
the user can add `ray.io/external-storage-namespace` annotation to RayCluster yaml file.

Whenever `ray.io/external-storage-namespace` annotation is set, the head/worker node will have `RAY_external_storage_namespace` environment
Expand All @@ -92,9 +92,9 @@ variable set which Ray can pick up later.

1. For now, Ray head/worker node that fails the readiness probe recovers itself by restarting itself. More fine-grained control and recovery mechanisms are expected in the future.

### Test Ray GCS HA
### Test Ray GCS FT

Currently, two tests are responsible for ensuring Ray GCS HA is working correctly.
Currently, two tests are responsible for ensuring Ray GCS FT is working correctly.

1. Detached actor test
2. RayServe test
51 changes: 50 additions & 1 deletion docs/guidance/observability.md
@@ -1,6 +1,55 @@
# Observability

### Monitor
## RayCluster Status

### State
In the RayCluster resource definition, we use `State` to represent the current status of the ray cluster.

For now, the resource status exposes three states: `ready`, `unhealthy`, and `failed`.
| State | Description |
| --------- | ----------------------------------------------------------------------------------------------- |
| ready | the ray cluster is ready for use. |
| unhealthy | something is misconfigured in the `startParams`, and the ray cluster may not behave correctly. |
| failed | a severe failure occurred and the head node or a worker node failed to start. |

If you use the apiserver to retrieve the resource, you can find the state in the `clusterState` field.

```json
curl --request GET '<baseUrl>/apis/v1alpha2/namespaces/<namespace>/clusters/<raycluster-name>'
{
"name": "<raycluster-name>",
"namespace": "<namespace>",
//...
"createdAt": "2022-08-10T10:31:25Z",
"clusterState": "ready",
//...
}
```

### Endpoint
If you use a NodePort service to expose the RayCluster endpoints, such as the dashboard or Redis, the `endpoints` field in the status records the service endpoints.

You can use the ports in `endpoints` directly to connect to the related service.

Also, if you use the apiserver to retrieve the resource, you can find the endpoints in the `serviceEndpoint` field.

```json
curl --request GET '<baseUrl>/apis/v1alpha2/namespaces/<namespace>/clusters/<raycluster-name>'
{
"name": "<raycluster-name>",
"namespace": "<namespace>",
//...
"serviceEndpoint": {
"dashboard": "30001",
"head": "30002",
"metrics": "30003",
"redis": "30004"
},
//...
}
```
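If you prefer not to go through the apiserver, the same information should be readable from the custom resource itself; the status field paths below are assumptions based on the fields described above:

```shell
# Assumed field paths; adjust if the RayCluster status schema differs.
kubectl get raycluster <raycluster-name> -n <namespace> \
  -o jsonpath='{.status.state}{"\n"}{.status.endpoints}{"\n"}'
```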

## Monitor

We have added a parameter `--metrics-expose-port=8080` to open the port and expose metrics both for the ray cluster and our control plane. We also leverage the [Prometheus Operator](https://github.com/prometheus-operator/prometheus-operator) to start the whole monitoring system.

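As a rough illustration of wiring the exposed port into the Prometheus Operator, a ServiceMonitor might look like the sketch below; the label selector and port name are assumptions, not values from this repository:

```yaml
# Hypothetical ServiceMonitor: adjust the selector and port name to match your metrics Service.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: ray-metrics
spec:
  selector:
    matchLabels:
      ray.io/cluster: raycluster-example   # placeholder label
  endpoints:
    - port: metrics                        # Service port mapped to --metrics-expose-port=8080
      interval: 30s
```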
33 changes: 30 additions & 3 deletions docs/guidance/rayservice.md
@@ -130,7 +130,7 @@ Then you can open your web browser with the url localhost:8265 to see your Ray d

### Update Ray Serve Deployment Graph

You can update the `serveDeploymentGraphConfig` in your RayService config file.
You can update the `serveConfig` in your RayService config file.
For example, if you update the mango price to 4 in [ray_v1alpha1_rayservice.yaml](https://github.com/ray-project/kuberay/blob/master/ray-operator/config/samples/ray_v1alpha1_rayservice.yaml).
```shell
- name: MangoStand
@@ -191,12 +191,39 @@ You can see RayService is preparing a pending cluster. After the pending cluster
You can use `kubectl logs` to check the operator logs or the head/worker nodes logs.
You can also use `kubectl describe rayservices rayservice-sample` to check the states and event logs of your RayService instance.

You can also login the head pod and use Ray cli to check the logs.
For Ray Serve monitoring, you can refer to the [Ray observability documentation](https://docs.ray.io/en/master/ray-observability/state/state-api.html).
To run Ray state APIs, you can log in to the head pod and use the Ray CLI.
`kubectl exec -it <head-node-pod> bash`
Or you can run the command locally:
`kubectl exec -it <head-node-pod> -- <ray state api>`
For example:
`kubectl exec -it <head-node-pod> -- ray summary tasks`
Output
```shell
======== Tasks Summary: 2022-07-28 15:10:24.801670 ========
Stats:
------------------------------------
total_actor_scheduled: 17
total_actor_tasks: 5
total_tasks: 0


Table (group by func_name):
------------------------------------
FUNC_OR_CLASS_NAME STATE_COUNTS TYPE
0 ServeController.listen_for_change RUNNING: 5 ACTOR_TASK
1 ServeReplica:MangoStand.__init__ FINISHED: 3 ACTOR_CREATION_TASK
2 HTTPProxyActor.__init__ FINISHED: 2 ACTOR_CREATION_TASK
3 ServeReplica:PearStand.__init__ FINISHED: 3 ACTOR_CREATION_TASK
4 ServeReplica:OrangeStand.__init__ FINISHED: 3 ACTOR_CREATION_TASK
5 ServeReplica:FruitMarket.__init__ FINISHED: 3 ACTOR_CREATION_TASK
6 ServeReplica:DAGDriver.__init__ FINISHED: 2 ACTOR_CREATION_TASK
7 ServeController.__init__ FINISHED: 1 ACTOR_CREATION_TASK
```

### Delete the RayService instance
`$ kubectl delete -f config/samples/ray_v1alpha1_rayservice.yaml`

### Delete the operator

`$ kubectl delete -k "github.com/ray-project/kuberay/ray-operator/config/default"`
`$ kubectl delete -k "github.com/ray-project/kuberay/ray-operator/config/default"`
2 changes: 1 addition & 1 deletion helm-chart/kuberay-apiserver/Chart.yaml
@@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
version: 0.3.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
10 changes: 8 additions & 2 deletions helm-chart/kuberay-apiserver/templates/role.yaml
@@ -11,6 +11,7 @@ rules:
- ray.io
resources:
- rayclusters
- rayjobs
verbs:
- create
- delete
@@ -31,12 +32,17 @@ rules:
- patch
- update
- watch
- apiGroups:
- ""
resources:
- namespaces
verbs:
- list
- apiGroups:
- ""
resources:
- events
verbs:
- get
- get
- list
- watch
{{- end }}
2 changes: 1 addition & 1 deletion helm-chart/kuberay-apiserver/values.yaml
@@ -7,7 +7,7 @@ replicaCount: 1
name: "kuberay-apiserver"
image:
repository: kuberay/apiserver
tag: nightly
tag: v0.3.0
pullPolicy: IfNotPresent

## Install Default RBAC roles and bindings
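With the chart and default image tag now both at 0.3.0, installing from the local chart (or overriding back to nightly) might look like this sketch; the release name and chart path are illustrative:

```shell
# Illustrative commands; the release name is arbitrary.
helm install kuberay-apiserver helm-chart/kuberay-apiserver
# To track nightly images instead of the pinned v0.3.0 default:
helm upgrade kuberay-apiserver helm-chart/kuberay-apiserver --set image.tag=nightly
```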
2 changes: 1 addition & 1 deletion helm-chart/kuberay-operator/Chart.yaml
@@ -2,6 +2,6 @@ apiVersion: v2
appVersion: "1.0"
description: A Helm chart for Kubernetes
name: kuberay-operator
version: 0.1.0
version: 0.3.0
icon: https://github.com/ray-project/ray/raw/master/doc/source/images/ray_header_logo.png
type: application
4 changes: 2 additions & 2 deletions helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
@@ -11471,15 +11471,15 @@ spec:
required:
- headGroupSpec
type: object
serveDeploymentGraphConfig:
serveConfig:
description: 'Important: Run "make" to regenerate code after modifying
this file'
properties:
importPath:
type: string
runtimeEnv:
type: string
serveConfigs:
deployments:
items:
description: ServeConfigSpec defines the desired state of RayService
Reference to http://rayserve.org
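To make the renamed fields concrete, a RayService spec matching the updated schema might look like the sketch below; the import path, runtime env, and fields under `deployments` are placeholders loosely modeled on the fruit-stand sample, not verified against it:

```yaml
# Sketch only: shows serveConfig (formerly serveDeploymentGraphConfig) and
# deployments (formerly serveConfigs); all values are placeholders.
apiVersion: ray.io/v1alpha1
kind: RayService
metadata:
  name: rayservice-sample
spec:
  serveConfig:
    importPath: fruit.deployment_graph              # placeholder import path
    runtimeEnv: |
      working_dir: "https://example.com/fruit.zip"  # placeholder URL
    deployments:
      - name: MangoStand
        numReplicas: 1          # field names in this item are assumptions
        userConfig: |
          price: 4
```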
28 changes: 28 additions & 0 deletions helm-chart/kuberay-operator/templates/ray_rayjob_editor_role.yaml
@@ -0,0 +1,28 @@
# permissions for end users to edit rayjobs.
{{- if .Values.rbacEnable }}
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
labels:
{{ include "kuberay-operator.labels" . | indent 4 }}
name: rayjob-editor-role
rules:
- apiGroups:
- ray.io
resources:
- rayjobs
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- ray.io
resources:
- rayjobs/status
verbs:
- get
{{- end }}
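A hypothetical binding that grants the new role to a user in a single namespace (the subject and namespace are placeholders):

```yaml
# Hypothetical RoleBinding: grants rayjob-editor-role to one user in the default namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: rayjob-editor-binding
  namespace: default                 # placeholder namespace
subjects:
  - kind: User
    name: jane@example.com           # placeholder user
    apiGroup: rbac.authorization.k8s.io
roleRef:
  kind: ClusterRole
  name: rayjob-editor-role
  apiGroup: rbac.authorization.k8s.io
```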