diff --git a/.github/workflows/test-job.yaml b/.github/workflows/test-job.yaml
index be8f69c923..2c21109bed 100644
--- a/.github/workflows/test-job.yaml
+++ b/.github/workflows/test-job.yaml
@@ -144,7 +144,7 @@ jobs:
         working-directory: ${{env.working-directory}}
       - name: Test
-        run: go test ./...
+        run: go test ./pkg/... ./cmd/... -race -parallel 4
         working-directory: ${{env.working-directory}}
       - name: Set up Docker
diff --git a/.gitignore b/.gitignore
index 6bd6eadbbb..cf8ea6e0c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,3 +47,6 @@
 # Any file with a .backup extension
 **/*.backup
+
+# Any file with a .log extension
+**/*.log
diff --git a/apiserver/DEVELOPMENT.md b/apiserver/DEVELOPMENT.md
index 0978e123ce..922571c265 100644
--- a/apiserver/DEVELOPMENT.md
+++ b/apiserver/DEVELOPMENT.md
@@ -61,6 +61,35 @@ make build
 make test
 ```
+
+#### End to End Testing
+
+There are two `make` targets that execute the end to end tests (integration between the KubeRay API server and the KubeRay operator):
+
+* `make e2e-test` executes all the tests defined in the [test/e2e package](./test/e2e/). It uses the cluster defined in `~/.kube/config` to submit the workloads.
+* `make local-e2e-test` creates a local kind cluster, builds the KubeRay operator and API server images from the current branch, and deploys the operator and API server into the kind cluster. It shuts down the kind cluster upon successful execution of the end to end tests. If the tests fail, the cluster is left running and has to be shut down manually by executing `make clean-cluster`.
+
+The `e2e` test targets use two variables to control which Ray images are used by the end to end tests:
+
+* `E2E_API_SERVER_RAY_IMAGE` -- the Ray docker image. Currently set to `rayproject/ray:2.7.0-py310`. On Apple silicon or other arm64 development machines, the `-aarch64` suffix is added to the image.
+* `E2E_API_SERVER_URL` -- the base URL of the deployed KubeRay API server. The default value is `http://localhost:31888`.
+
+The end to end test targets also honor the `GO_TEST_FLAGS` Makefile variable. Overriding it with a `-v` option makes both the unit and the end to end tests print their output / debug messages. By default, those messages are shown only when a test fails.
+
+The default values of the variables can be overridden using the `-e` make command line argument.
+
+Examples:
+
+```bash
+# To run the end to end tests using the default cluster
+make e2e-test
+
+# To run the end to end tests in a fresh cluster.
+# Please note that:
+# * the cluster created for this test is the same as the cluster created by make cluster.
+# * if the end to end tests fail, the cluster will still be up and has to be explicitly shut down by executing make clean-cluster
+make local-e2e-test
+```
+
 #### Swagger UI updates

 To update the swagger ui files deployed with the Kuberay API server, you'll need to:
@@ -117,7 +146,7 @@ make run

 #### Access

-Access the service at `localhost:8888` for http, and `locahost:8887` for the RPC port.
+Access the service at `localhost:8888` for http, and `localhost:8887` for the RPC port.

 ### Kubernetes Deployment
@@ -160,10 +189,11 @@ As a convenience for local development the following `make` targets are provided

* `make cluster` -- creates a local kind cluster, using the configuration from `hack/kind-cluster-config.yaml`. It creates a port mapping allowing for the service running in the kind cluster to be accessed on `localhost:31888` for HTTP and `localhost:31887` for RPC.
* `make clean-cluster` -- deletes the local kind cluster created with `make cluster`
* `load-image` -- loads the docker image defined by the `IMG` make variable into the kind cluster. The default value for variable is: `kuberay/apiserver:latest`. The name of the image can be changed by using `make load-image -e IMG=`
-* `operator-image` -- Build the operator image to be loaded in your kind cluster. The tag for the operator image is `kuberay/operator:latest`. This step is optional.
-* `load-operator-image` -- Load the operator image to the kind cluster created with `create-kind-cluster`. The tag for the operator image is `kuberay/operator:latest`, and the tag can be overridden using `make load-operator-image -E OPERATOR_IMAGE_TAG=`. To use the nightly operator tag, set `OPERATOR_IMAGE_TAG` to `nightly`.
+* `operator-image` -- Build the operator image to be loaded in your kind cluster. The built image is `kuberay/operator:latest`. The image tag can be overridden from the command line (example: `make operator-image -e OPERATOR_IMAGE_TAG=foo`)
+* `load-operator-image` -- Load the operator image to the kind cluster created with `make cluster`. It should be used in conjunction with the `deploy-operator` target
* `deploy-operator` -- Deploy operator into your cluster. The tag for the operator image is `kuberay/operator:latest`.
* `undeploy-operator` -- Undeploy operator from your cluster
+* `load-ray-test-image` -- Load the ray test images into the cluster.

When developing and testing with kind you might want to execute these targets together:
@@ -173,8 +203,14 @@ make docker-image cluster load-image deploy

#To create a new API server image, operator image and deploy them on a new cluster
make docker-image operator-image cluster load-image load-operator-image deploy deploy-operator
+
+#To execute the end to end tests with a locally built operator and verbose output
+make local-e2e-test -e GO_TEST_FLAGS="-v"
+
+#To execute the end to end tests with the nightly operator build
+make local-e2e-test -e OPERATOR_IMAGE_TAG=nightly
```
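The end to end tests themselves are ordinary Go tests that drive the API server through the HTTP client added under `apiserver/pkg/http` in this PR. For orientation, a minimal test could look like the sketch below. This is an editorial illustration, not a copy of the `test/e2e` package; in particular, reading `E2E_API_SERVER_URL` from the environment is an assumption about how the Makefile variable reaches the test process.

```go
package e2e

import (
	"net/http"
	"os"
	"testing"

	kuberayHTTP "github.com/ray-project/kuberay/apiserver/pkg/http"
)

// TestAPIServerIsReachable is a smoke test: it only verifies that the API
// server answers a list request at the configured base URL.
func TestAPIServerIsReachable(t *testing.T) {
	baseURL := os.Getenv("E2E_API_SERVER_URL") // assumed to be exported by the caller
	if baseURL == "" {
		baseURL = "http://localhost:31888" // default from the Makefile
	}
	client := kuberayHTTP.NewKuberayAPIServerClient(baseURL, &http.Client{})
	if _, _, err := client.ListAllClusters(); err != nil {
		t.Fatalf("API server is not reachable at %s: %v", baseURL, err)
	}
}
```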
#### Access API Server in the Cluster

-Access the service at `localhost:31888` for http and `locahost:31887` for the RPC port.
+Access the service at `localhost:31888` for http and `localhost:31887` for the RPC port.
diff --git a/apiserver/Makefile b/apiserver/Makefile
index bea4f256d6..09fba4adb2 100644
--- a/apiserver/Makefile
+++ b/apiserver/Makefile
@@ -7,6 +7,18 @@ REPO_ROOT_BIN := $(REPO_ROOT)/bin
 IMG_TAG ?=latest
 IMG ?= kuberay/apiserver:$(IMG_TAG)
+
+# Allow for additional test flags (-v, etc.)
+GO_TEST_FLAGS ?=
+# Ray docker images to use for the end to end tests, selected by architecture;
+# arm64 environments (Apple silicon included) pull the architecture-specific image
+ifeq (arm64,$(shell go env GOARCH))
+E2E_API_SERVER_RAY_IMAGE ?=rayproject/ray:2.7.0-py310-aarch64
+else
+E2E_API_SERVER_RAY_IMAGE ?=rayproject/ray:2.7.0-py310
+endif
+# KubeRay API Server base URL to use in the end to end tests
+E2E_API_SERVER_URL ?=http://localhost:31888
+
 # Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set)
 ifeq (,$(shell go env GOBIN))
 GOBIN=$(shell go env GOPATH)/bin
@@ -43,42 +55,102 @@ help: ## Display this help.

##@ Development

+.PHONY: fmt
fmt: ## Run go fmt against code.
	go fmt ./...

+.PHONY: vet
vet: ## Run go vet against code.
	go vet ./...

+.PHONY: fumpt
fumpt: gofumpt ## Run gofumpt against code.
	$(GOFUMPT) -l -w .

+.PHONY: imports
imports: goimports ## Run goimports against code.
	$(GOIMPORTS) -l -w .
-test: fmt vet fumpt imports lint ## Run unit tests.
-	go test ./... -race -coverprofile ray-kube-api-server-coverage.out
-
+.PHONY: lint
lint: golangci-lint fmt vet fumpt imports ## Run the linter.
	$(GOLANGCI_LINT) run --timeout=3m

-##@ Build
-
build: fmt vet fumpt imports lint ## Build api server binary.
	go build -o ${REPO_ROOT_BIN}/kuberay-apiserver cmd/main.go

run: fmt vet fumpt imports lint ## Run the api server from your host.
	go run -race cmd/main.go -localSwaggerPath ${REPO_ROOT}/proto/swagger

-docker-image: test ## Build image with the api server.
-	${ENGINE} build -t ${IMG} -f Dockerfile ..
-
-docker-push: ## Push image with the api server.
-	${ENGINE} push ${IMG}
-
.PHONY: build-swagger
build-swagger: go-bindata
	cd $(REPO_ROOT) && $(GOBINDATA) --nocompress --pkg swagger -o apiserver/pkg/swagger/datafile.go third_party/swagger-ui/...

+##@ Testing
+
+.PHONY: test
+test: fmt vet fumpt imports lint ## Run all unit tests.
+	go test ./pkg/... ./cmd/... $(GO_TEST_FLAGS) -race -coverprofile ray-kube-api-server-coverage.out -parallel 4
+
+.PHONY: e2e-test
+e2e-test: ## Run end to end tests using a pre-existing cluster.
+	go test ./test/e2e/... $(GO_TEST_FLAGS) -timeout 60m -race -count=1 -parallel 4
+
+.PHONY: local-e2e-test
+local-e2e-test: docker-image operator-image cluster load-image load-operator-image deploy-operator deploy load-ray-test-image e2e-test clean-cluster ## Run end to end tests on a freshly created kind cluster with all components deployed.
+
+##@ Testing Setup
+KIND_CONFIG ?= hack/kind-cluster-config.yaml
+KIND_CLUSTER_NAME ?= ray-api-server-cluster
+OPERATOR_IMAGE_TAG ?= latest
+.PHONY: cluster
+cluster: kind ## Start kind development cluster.
+	$(KIND) create cluster -n $(KIND_CLUSTER_NAME) --config $(KIND_CONFIG)
+
+.PHONY: clean-cluster
+clean-cluster: kind ## Delete kind development cluster.
+	$(KIND) delete cluster -n $(KIND_CLUSTER_NAME)
+
+.PHONY: load-image
+load-image: ## Load the api server image to the kind cluster created with make cluster.
+	$(KIND) load docker-image $(IMG) -n $(KIND_CLUSTER_NAME)
+
+.PHONY: operator-image
+operator-image: ## Build the operator image to be loaded in your kind cluster.
+	cd ../ray-operator && $(MAKE) docker-image -e IMG=kuberay/operator:$(OPERATOR_IMAGE_TAG)
+
+.PHONY: deploy-operator
+deploy-operator: ## Deploy operator via helm into the K8s cluster specified in ~/.kube/config.
+# Note that you should make your operator image available by either pushing it to an image registry, such as DockerHub or Quay, or by loading the image into the Kubernetes cluster.
+# If you are using a Kind cluster for development, you can run `make load-image` to load the newly built image into the Kind cluster.
+	helm upgrade --install raycluster ../helm-chart/kuberay-operator --wait \
+		--set image.tag=${OPERATOR_IMAGE_TAG} --set image.pullPolicy=IfNotPresent
+
+.PHONY: undeploy-operator
+undeploy-operator: ## Undeploy operator via helm from the K8s cluster specified in ~/.kube/config.
+	helm uninstall raycluster --wait
+
+.PHONY: load-operator-image
+load-operator-image: ## Load the operator image to the kind cluster created with make cluster.
+ifneq (${OPERATOR_IMAGE_TAG}, latest) + $(ENGINE) pull kuberay/operator:$(OPERATOR_IMAGE_TAG) +endif + $(KIND) load docker-image kuberay/operator:$(OPERATOR_IMAGE_TAG) -n $(KIND_CLUSTER_NAME) + +.PHONY: load-ray-test-image +load-ray-test-image: ## Load the ray test images + $(ENGINE) pull $(E2E_API_SERVER_RAY_IMAGE) + $(KIND) load docker-image $(E2E_API_SERVER_RAY_IMAGE) -n $(KIND_CLUSTER_NAME) + $(ENGINE) pull rayproject/ray:latest + $(KIND) load docker-image rayproject/ray:latest -n $(KIND_CLUSTER_NAME) + +##@ Docker Build + +docker-image: test ## Build image with the api server. + $(ENGINE) build -t ${IMG} -f Dockerfile .. + +docker-push: ## Push image with the api server. + $(ENGINE) push ${IMG} + ##@ Deployment .PHONY: install install: kustomize ## Install the kuberay api server to the K8s cluster specified in ~/.kube/config. @@ -100,7 +172,7 @@ deploy: ## Deploy via helm the kuberay api server to the K8s cluster specified i undeploy: ## Undeploy via helm the kuberay api server to the K8s cluster specified in ~/.kube/config. helm uninstall kuberay-apiserver --wait -##@ Development Tools +##@ Development Tools Setup ## Location to install dependencies to $(REPO_ROOT_BIN): @@ -118,7 +190,7 @@ GOBINDATA ?= $(REPO_ROOT_BIN)/go-bindata ## Tool Versions KUSTOMIZE_VERSION ?= v3.8.7 GOFUMPT_VERSION ?= v0.3.1 -GOIMPORTS_VERSION ?= latest +GOIMPORTS_VERSION ?= v0.14.0 GOLANGCI_LINT_VERSION ?= v1.54.1 KIND_VERSION ?= v0.19.0 GOBINDATA_VERSION ?= v4.0.2 @@ -165,39 +237,3 @@ clean-dev-tools: ## Remove all development tools rm -f $(REPO_ROOT_BIN)/goimports rm -f $(REPO_ROOT_BIN)/kind rm -f $(REPO_ROOT_BIN)/go-bindata - - -##@ Testing Setup and Tools -KIND_CONFIG ?= hack/kind-cluster-config.yaml -KIND_CLUSTER_NAME ?= ray-api-server-cluster -OPERATOR_IMAGE_TAG ?= latest -.PHONY: cluster -cluster: kind ## Start kind development cluster. - $(KIND) create cluster -n $(KIND_CLUSTER_NAME) --config $(KIND_CONFIG) - -.PHONY: clean-cluster -clean-cluster: kind ## Delete kind development cluster. - $(KIND) delete cluster -n $(KIND_CLUSTER_NAME) - -.PHONY: load-image -load-image: ## Load the api server image to the kind cluster created with create-kind-cluster. - $(KIND) load docker-image $(IMG) -n $(KIND_CLUSTER_NAME) - -.PHONY: operator-image -operator-image: ## Build the operator image to be loaded in your kind cluster. - cd ../ray-operator && $(MAKE) docker-image -e IMG=kuberay/operator:$(OPERATOR_IMAGE_TAG) - -.PHONY: deploy-operator -deploy-operator: ## Deploy operator via helm into the K8s cluster specified in ~/.kube/config. -# Note that you should make your operatorimage available by either pushing it to an image registry, such as DockerHub or Quay, or by loading the image into the Kubernetes cluster. -# If you are using a Kind cluster for development, you can run `make load-image` to load the newly built image into the Kind cluster. - helm upgrade --install raycluster ../helm-chart/kuberay-operator --wait \ - --set image.tag=${OPERATOR_IMAGE_TAG} --set image.pullPolicy=IfNotPresent - -.PHONY: undeploy-operator -undeploy-operator: ## Undeploy operator via helm from the K8s cluster specified in ~/.kube/config. - helm uninstall raycluster --wait - -.PHONY: load-operator-image -load-operator-image: ## Load the operator image to the kind cluster created with create-kind-cluster. 
- $(KIND) load docker-image kuberay/operator:$(OPERATOR_IMAGE_TAG) -n $(KIND_CLUSTER_NAME) diff --git a/apiserver/Volumes.md b/apiserver/Volumes.md index cf9eee5d3d..c7cb99f03b 100644 --- a/apiserver/Volumes.md +++ b/apiserver/Volumes.md @@ -38,8 +38,7 @@ The code below gives an example of hostPath volume definition: A Persistent Volume Claim (PVC) is a request for storage by a user. It is similar to a Pod. Pods consume node resources and PVCs consume PV resources. Pods can request specific levels of resources (CPU and Memory). Claims can request specific size and access modes (e.g., they can be mounted `ReadWriteOnce`, `ReadOnlyMany` or `ReadWriteMany`). -The caveat of using PVC volumes is that the same PVC is mounted to all nodes. As a result only PVCs with access -mode `ReadOnlyMany` can be used in this case. +The caveat of using PVC volumes is that the same PVC is mounted to all nodes. As a result only PVCs with access mode `ReadOnlyMany` can be used in this case. The code below gives an example of PVC volume definition: @@ -121,7 +120,7 @@ The code below gives an example of secret volume definition: An emptyDir volume is first created when a Pod is assigned to a node, and exists as long as that Pod is running on that node. As the name says, the emptyDir volume is initially empty. All containers in the Pod can read and write the same files in the emptyDir volume, though that volume can be mounted at the same or different paths in each container. When a Pod is removed from a node for any reason, the data in the emptyDir is deleted permanently. -The code below gives an example of empydir volume definition: +The code below gives an example of empty directory volume definition: ```json { diff --git a/apiserver/go.mod b/apiserver/go.mod index ad7fa77ad1..9874544b4d 100644 --- a/apiserver/go.mod +++ b/apiserver/go.mod @@ -20,12 +20,16 @@ require ( ) require ( + github.com/dustinkirkland/golang-petname v0.0.0-20230626224747-e794b9370d49 github.com/elazarl/go-bindata-assetfs v1.0.1 github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 github.com/grpc-ecosystem/grpc-gateway/v2 v2.6.0 + google.golang.org/genproto v0.0.0-20210909211513-a8c4777a87af ) +require github.com/pmezard/go-difflib v1.0.0 // indirect + require ( github.com/asaskevich/govalidator v0.0.0-20200428143746-21a406dcc535 // indirect github.com/beorn7/perks v1.0.1 // indirect @@ -48,7 +52,6 @@ require ( github.com/mitchellh/mapstructure v1.4.1 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect github.com/prometheus/client_model v0.2.0 // indirect github.com/prometheus/common v0.28.0 // indirect github.com/prometheus/procfs v0.6.0 // indirect @@ -61,7 +64,6 @@ require ( golang.org/x/text v0.13.0 // indirect golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac // indirect google.golang.org/appengine v1.6.7 // indirect - google.golang.org/genproto v0.0.0-20210909211513-a8c4777a87af // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/apiserver/go.sum b/apiserver/go.sum index 30a165dce7..1a7bfebe32 100644 --- a/apiserver/go.sum +++ b/apiserver/go.sum @@ -85,6 +85,8 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/docker/go-units v0.3.3/go.mod 
h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= +github.com/dustinkirkland/golang-petname v0.0.0-20230626224747-e794b9370d49 h1:6SNWi8VxQeCSwmLuTbEvJd7xvPmdS//zvMBWweZLgck= +github.com/dustinkirkland/golang-petname v0.0.0-20230626224747-e794b9370d49/go.mod h1:V+Qd57rJe8gd4eiGzZyg4h54VLHmYVVw54iMnlAMrF8= github.com/elazarl/go-bindata-assetfs v1.0.1 h1:m0kkaHRKEu7tUIUFVwhGGGYClXvyl4RE03qmvRTNfbw= github.com/elazarl/go-bindata-assetfs v1.0.1/go.mod h1:v+YaWX3bdea5J/mo8dSETolEo7R71Vk1u8bnjau5yw4= github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc= diff --git a/apiserver/hack/kind-cluster-config.yaml b/apiserver/hack/kind-cluster-config.yaml index e0c4aeaa16..53aef5cc7d 100644 --- a/apiserver/hack/kind-cluster-config.yaml +++ b/apiserver/hack/kind-cluster-config.yaml @@ -10,17 +10,6 @@ nodes: kubeletExtraArgs: node-labels: "ingress-ready=true" extraPortMappings: - - containerPort: 30265 - hostPort: 8265 - listenAddress: "0.0.0.0" - protocol: tcp - - containerPort: 30001 - hostPort: 10001 - listenAddress: "0.0.0.0" - protocol: tcp - - containerPort: 8000 - hostPort: 8000 - listenAddress: "0.0.0.0" - containerPort: 31888 hostPort: 31888 listenAddress: "0.0.0.0" @@ -31,3 +20,5 @@ nodes: image: kindest/node:v1.23.17@sha256:59c989ff8a517a93127d4a536e7014d28e235fb3529d9fba91b3951d461edfdb - role: worker image: kindest/node:v1.23.17@sha256:59c989ff8a517a93127d4a536e7014d28e235fb3529d9fba91b3951d461edfdb +- role: worker + image: kindest/node:v1.23.17@sha256:59c989ff8a517a93127d4a536e7014d28e235fb3529d9fba91b3951d461edfdb diff --git a/apiserver/pkg/http/client.go b/apiserver/pkg/http/client.go new file mode 100644 index 0000000000..66a4f41a8e --- /dev/null +++ b/apiserver/pkg/http/client.go @@ -0,0 +1,551 @@ +package http + +import ( + "bytes" + "errors" + "fmt" + "io" + "net/http" + + api "github.com/ray-project/kuberay/proto/go_client" + rpcStatus "google.golang.org/genproto/googleapis/rpc/status" + "google.golang.org/protobuf/encoding/protojson" +) + +type KuberayAPIServerClient struct { + httpClient *http.Client + baseURL string + marshaler *protojson.MarshalOptions + unmarshaler *protojson.UnmarshalOptions +} + +type KuberayAPIServerClientError struct { + HTTPStatusCode int +} + +func (krce *KuberayAPIServerClientError) Error() string { + return fmt.Sprintf("kuberay api server request failed with HTTP status (%d)", krce.HTTPStatusCode) +} + +func IsNotFoundError(err error) bool { + if err != nil { + apiServerError := &KuberayAPIServerClientError{} + if errors.As(err, &apiServerError); apiServerError.HTTPStatusCode == http.StatusNotFound { + return true + } + } + return false +} + +func NewKuberayAPIServerClient(baseURL string, httpClient *http.Client) *KuberayAPIServerClient { + return &KuberayAPIServerClient{ + httpClient: httpClient, + baseURL: baseURL, + marshaler: &protojson.MarshalOptions{ + Multiline: true, + Indent: " ", + AllowPartial: false, + UseProtoNames: true, + UseEnumNumbers: false, + EmitUnpopulated: false, + Resolver: nil, + }, + unmarshaler: &protojson.UnmarshalOptions{ + AllowPartial: false, + DiscardUnknown: false, + Resolver: nil, + }, + } +} + +// CreateComputeTemplate creates a new compute template. 
+func (krc *KuberayAPIServerClient) CreateComputeTemplate(request *api.CreateComputeTemplateRequest) (*api.ComputeTemplate, *rpcStatus.Status, error) {
+	createURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/compute_templates"
+
+	bytez, err := krc.marshaler.Marshal(request.ComputeTemplate)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to marshal api.ComputeTemplate to JSON: %w", err)
+	}
+
+	httpRequest, err := krc.createHttpRequest("POST", createURL, bytes.NewReader(bytez))
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", createURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+	httpRequest.Header.Add("Content-Type", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, createURL)
+	if err != nil {
+		return nil, status, err
+	}
+	computeTemplate := &api.ComputeTemplate{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, computeTemplate); err != nil {
+		// Propagate the unmarshaling error instead of silently returning a nil error.
+		return nil, status, err
+	}
+
+	return computeTemplate, nil, nil
+}
+
+// DeleteComputeTemplate deletes a compute template.
+func (krc *KuberayAPIServerClient) DeleteComputeTemplate(request *api.DeleteComputeTemplateRequest) (*rpcStatus.Status, error) {
+	deleteURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/compute_templates/" + request.Name
+	return krc.doDelete(deleteURL)
+}
+
+// GetComputeTemplate finds a specific compute template by its name and namespace.
+func (krc *KuberayAPIServerClient) GetComputeTemplate(request *api.GetComputeTemplateRequest) (*api.ComputeTemplate, *rpcStatus.Status, error) {
+	getURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/compute_templates/" + request.Name
+	httpRequest, err := krc.createHttpRequest("GET", getURL, nil)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", getURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, getURL)
+	if err != nil {
+		return nil, status, err
+	}
+	computeTemplate := &api.ComputeTemplate{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, computeTemplate); err != nil {
+		return nil, status, err
+	}
+	return computeTemplate, nil, nil
+}
+
+// GetAllComputeTemplates finds all compute templates in all namespaces.
+func (krc *KuberayAPIServerClient) GetAllComputeTemplates() (*api.ListAllComputeTemplatesResponse, *rpcStatus.Status, error) {
+	getURL := krc.baseURL + "/apis/v1/compute_templates"
+	httpRequest, err := krc.createHttpRequest("GET", getURL, nil)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", getURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, getURL)
+	if err != nil {
+		return nil, status, err
+	}
+	response := &api.ListAllComputeTemplatesResponse{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, response); err != nil {
+		return nil, status, err
+	}
+	return response, nil, nil
+}
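+
+// Usage sketch (editorial example, not shipped in this PR): creating a
+// compute template and reading it back. The ComputeTemplate field values
+// are elided because only the request wrapper fields are visible in this diff.
+//
+//	client := NewKuberayAPIServerClient("http://localhost:31888", &http.Client{})
+//	created, _, err := client.CreateComputeTemplate(&api.CreateComputeTemplateRequest{
+//		Namespace:       "ray-system",
+//		ComputeTemplate: &api.ComputeTemplate{ /* name, cpu, memory, ... */ },
+//	})
+//	if err == nil {
+//		tmpl, _, _ := client.GetComputeTemplate(&api.GetComputeTemplateRequest{Namespace: "ray-system", Name: created.Name})
+//		_ = tmpl
+//	}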
+
+// GetAllComputeTemplatesInNamespace finds all compute templates in a given namespace.
+func (krc *KuberayAPIServerClient) GetAllComputeTemplatesInNamespace(request *api.ListComputeTemplatesRequest) (*api.ListComputeTemplatesResponse, *rpcStatus.Status, error) {
+	getURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/compute_templates"
+	httpRequest, err := krc.createHttpRequest("GET", getURL, nil)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", getURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, getURL)
+	if err != nil {
+		return nil, status, err
+	}
+	response := &api.ListComputeTemplatesResponse{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, response); err != nil {
+		return nil, status, err
+	}
+	return response, nil, nil
+}
+
+// CreateCluster creates a new cluster.
+func (krc *KuberayAPIServerClient) CreateCluster(request *api.CreateClusterRequest) (*api.Cluster, *rpcStatus.Status, error) {
+	createURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/clusters"
+
+	bytez, err := krc.marshaler.Marshal(request.Cluster)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to marshal api.Cluster to JSON: %w", err)
+	}
+
+	httpRequest, err := krc.createHttpRequest("POST", createURL, bytes.NewReader(bytez))
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", createURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+	httpRequest.Header.Add("Content-Type", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, createURL)
+	if err != nil {
+		return nil, status, err
+	}
+	cluster := &api.Cluster{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, cluster); err != nil {
+		return nil, status, err
+	}
+	return cluster, nil, nil
+}
+
+// DeleteCluster deletes a cluster.
+func (krc *KuberayAPIServerClient) DeleteCluster(request *api.DeleteClusterRequest) (*rpcStatus.Status, error) {
+	deleteURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/clusters/" + request.Name
+	return krc.doDelete(deleteURL)
+}
+
+// GetCluster finds a specific cluster by its name and namespace.
+func (krc *KuberayAPIServerClient) GetCluster(request *api.GetClusterRequest) (*api.Cluster, *rpcStatus.Status, error) {
+	getURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/clusters/" + request.Name
+	httpRequest, err := krc.createHttpRequest("GET", getURL, nil)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", getURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, getURL)
+	if err != nil {
+		return nil, status, err
+	}
+	cluster := &api.Cluster{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, cluster); err != nil {
+		return nil, status, err
+	}
+	return cluster, nil, nil
+}
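+
+// Usage sketch (editorial example): distinguishing a missing cluster from
+// other failures with IsNotFoundError before deciding to create it.
+//
+//	_, _, err := client.GetCluster(&api.GetClusterRequest{Namespace: "ray-system", Name: "demo"})
+//	if IsNotFoundError(err) {
+//		// the cluster does not exist yet; safe to call CreateCluster
+//	} else if err != nil {
+//		// transport error or a non-404 API server response
+//	}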
+
+// ListClusters finds all clusters in a given namespace.
+func (krc *KuberayAPIServerClient) ListClusters(request *api.ListClustersRequest) (*api.ListClustersResponse, *rpcStatus.Status, error) {
+	getURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/clusters"
+	httpRequest, err := krc.createHttpRequest("GET", getURL, nil)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", getURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, getURL)
+	if err != nil {
+		return nil, status, err
+	}
+	response := &api.ListClustersResponse{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, response); err != nil {
+		return nil, status, err
+	}
+	return response, nil, nil
+}
+
+// ListAllClusters finds all clusters in all namespaces. Supports pagination and sorting on certain fields.
+func (krc *KuberayAPIServerClient) ListAllClusters() (*api.ListAllClustersResponse, *rpcStatus.Status, error) {
+	getURL := krc.baseURL + "/apis/v1/clusters"
+	httpRequest, err := krc.createHttpRequest("GET", getURL, nil)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", getURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, getURL)
+	if err != nil {
+		return nil, status, err
+	}
+	response := &api.ListAllClustersResponse{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, response); err != nil {
+		return nil, status, err
+	}
+	return response, nil, nil
+}
+
+// CreateRayJob creates a new job.
+func (krc *KuberayAPIServerClient) CreateRayJob(request *api.CreateRayJobRequest) (*api.RayJob, *rpcStatus.Status, error) {
+	createURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/jobs"
+	bytez, err := krc.marshaler.Marshal(request.Job)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to marshal api.RayJob to JSON: %w", err)
+	}
+
+	httpRequest, err := krc.createHttpRequest("POST", createURL, bytes.NewReader(bytez))
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", createURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+	httpRequest.Header.Add("Content-Type", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, createURL)
+	if err != nil {
+		return nil, status, err
+	}
+	rayJob := &api.RayJob{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, rayJob); err != nil {
+		return nil, status, err
+	}
+	return rayJob, nil, nil
+}
+
+// GetRayJob finds a specific job by its name and namespace.
+func (krc *KuberayAPIServerClient) GetRayJob(request *api.GetRayJobRequest) (*api.RayJob, *rpcStatus.Status, error) {
+	getURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/jobs/" + request.Name
+	httpRequest, err := krc.createHttpRequest("GET", getURL, nil)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", getURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, getURL)
+	if err != nil {
+		return nil, status, err
+	}
+	rayJob := &api.RayJob{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, rayJob); err != nil {
+		return nil, status, err
+	}
+	return rayJob, nil, nil
+}
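+
+// Usage sketch (editorial example): creating a job and cleaning it up when
+// done. The api.RayJob payload is elided; its required fields are validated
+// server side (see pkg/server/job_server.go).
+//
+//	job, _, err := client.CreateRayJob(&api.CreateRayJobRequest{Namespace: ns, Job: jobSpec})
+//	if err == nil {
+//		defer client.DeleteRayJob(&api.DeleteRayJobRequest{Namespace: ns, Name: job.Name})
+//	}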
+
+// ListRayJobs finds all jobs in a given namespace.
+func (krc *KuberayAPIServerClient) ListRayJobs(request *api.ListRayJobsRequest) (*api.ListRayJobsResponse, *rpcStatus.Status, error) {
+	getURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/jobs"
+	httpRequest, err := krc.createHttpRequest("GET", getURL, nil)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", getURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, getURL)
+	if err != nil {
+		return nil, status, err
+	}
+	response := &api.ListRayJobsResponse{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, response); err != nil {
+		return nil, status, err
+	}
+	return response, nil, nil
+}
+
+// ListAllRayJobs finds all jobs in all namespaces.
+func (krc *KuberayAPIServerClient) ListAllRayJobs() (*api.ListAllRayJobsResponse, *rpcStatus.Status, error) {
+	getURL := krc.baseURL + "/apis/v1/jobs"
+	httpRequest, err := krc.createHttpRequest("GET", getURL, nil)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", getURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, getURL)
+	if err != nil {
+		return nil, status, err
+	}
+	response := &api.ListAllRayJobsResponse{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, response); err != nil {
+		return nil, status, err
+	}
+	return response, nil, nil
+}
+
+// DeleteRayJob deletes a job by its name and namespace.
+func (krc *KuberayAPIServerClient) DeleteRayJob(request *api.DeleteRayJobRequest) (*rpcStatus.Status, error) {
+	deleteURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/jobs/" + request.Name
+	return krc.doDelete(deleteURL)
+}
+
+// CreateRayService creates a new ray service.
+func (krc *KuberayAPIServerClient) CreateRayService(request *api.CreateRayServiceRequest) (*api.RayService, *rpcStatus.Status, error) {
+	createURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/services"
+	bytez, err := krc.marshaler.Marshal(request.Service)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to marshal api.RayService to JSON: %w", err)
+	}
+
+	httpRequest, err := krc.createHttpRequest("POST", createURL, bytes.NewReader(bytez))
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", createURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+	httpRequest.Header.Add("Content-Type", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, createURL)
+	if err != nil {
+		return nil, status, err
+	}
+	rayService := &api.RayService{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, rayService); err != nil {
+		return nil, status, err
+	}
+	return rayService, nil, nil
+}
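+
+// Usage sketch (editorial example): on failure the client surfaces the API
+// server's google.rpc.Status alongside the error, which carries the code and
+// message returned by the server.
+//
+//	svc, status, err := client.CreateRayService(&api.CreateRayServiceRequest{Namespace: ns, Service: serviceSpec})
+//	if err != nil && status != nil {
+//		fmt.Printf("create failed with code %d: %s\n", status.Code, status.Message)
+//	}
+//	_ = svc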
+
+// UpdateRayService updates a ray service.
+func (krc *KuberayAPIServerClient) UpdateRayService(request *api.UpdateRayServiceRequest) (*api.RayService, *rpcStatus.Status, error) {
+	updateURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/services/" + request.Name
+	bytez, err := krc.marshaler.Marshal(request.Service)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to marshal api.RayService to JSON: %w", err)
+	}
+
+	httpRequest, err := krc.createHttpRequest("PUT", updateURL, bytes.NewReader(bytez))
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", updateURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+	httpRequest.Header.Add("Content-Type", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, updateURL)
+	if err != nil {
+		return nil, status, err
+	}
+	rayService := &api.RayService{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, rayService); err != nil {
+		return nil, status, err
+	}
+	return rayService, nil, nil
+}
+
+// UpdateRayServiceConfigs updates a ray service's configs.
+// This is a patch-mode update that avoids a possible deletion of the existing RayCluster under the hood.
+// Only the serve configs and worker group specs can be updated.
+func (krc *KuberayAPIServerClient) UpdateRayServiceConfigs(request *api.UpdateRayServiceConfigsRequest) (*api.RayService, *rpcStatus.Status, error) {
+	updateURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/services/" + request.Name + "/configs"
+	bytez, err := krc.marshaler.Marshal(request.UpdateService)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to marshal update service request to JSON: %w", err)
+	}
+
+	httpRequest, err := krc.createHttpRequest("PATCH", updateURL, bytes.NewReader(bytez))
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", updateURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+	httpRequest.Header.Add("Content-Type", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, updateURL)
+	if err != nil {
+		return nil, status, err
+	}
+	rayService := &api.RayService{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, rayService); err != nil {
+		return nil, status, err
+	}
+	return rayService, nil, nil
+}
+
+// GetRayService finds a specific ray service by its name and namespace.
+func (krc *KuberayAPIServerClient) GetRayService(request *api.GetRayServiceRequest) (*api.RayService, *rpcStatus.Status, error) {
+	getURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/services/" + request.Name
+	httpRequest, err := krc.createHttpRequest("GET", getURL, nil)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", getURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, getURL)
+	if err != nil {
+		return nil, status, err
+	}
+	response := &api.RayService{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, response); err != nil {
+		return nil, status, err
+	}
+	return response, nil, nil
+}
+
+// ListRayServices finds all ray services in a given namespace. Supports pagination and sorting on certain fields.
+func (krc *KuberayAPIServerClient) ListRayServices(request *api.ListRayServicesRequest) (*api.ListRayServicesResponse, *rpcStatus.Status, error) {
+	getURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/services"
+	httpRequest, err := krc.createHttpRequest("GET", getURL, nil)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", getURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, getURL)
+	if err != nil {
+		return nil, status, err
+	}
+	response := &api.ListRayServicesResponse{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, response); err != nil {
+		return nil, status, err
+	}
+	return response, nil, nil
+}
+
+// ListAllRayServices finds all ray services in all namespaces. Supports pagination and sorting on certain fields.
+func (krc *KuberayAPIServerClient) ListAllRayServices() (*api.ListAllRayServicesResponse, *rpcStatus.Status, error) {
+	getURL := krc.baseURL + "/apis/v1/services"
+	httpRequest, err := krc.createHttpRequest("GET", getURL, nil)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create http request for url '%s': %w", getURL, err)
+	}
+
+	httpRequest.Header.Add("Accept", "application/json")
+
+	bodyBytes, status, err := krc.executeRequest(httpRequest, getURL)
+	if err != nil {
+		return nil, status, err
+	}
+	response := &api.ListAllRayServicesResponse{}
+	if err := krc.unmarshaler.Unmarshal(bodyBytes, response); err != nil {
+		return nil, status, err
+	}
+	return response, nil, nil
+}
+
+// DeleteRayService deletes a ray service by its name and namespace.
+func (krc *KuberayAPIServerClient) DeleteRayService(request *api.DeleteRayServiceRequest) (*rpcStatus.Status, error) {
+	deleteURL := krc.baseURL + "/apis/v1/namespaces/" + request.Namespace + "/services/" + request.Name
+	return krc.doDelete(deleteURL)
+}
+
+func (krc *KuberayAPIServerClient) doDelete(deleteURL string) (*rpcStatus.Status, error) {
+	httpRequest, err := krc.createHttpRequest("DELETE", deleteURL, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create http request for url '%s': %w", deleteURL, err)
+	}
+	httpRequest.Header.Add("Accept", "application/json")
+	_, status, err := krc.executeRequest(httpRequest, deleteURL)
+	return status, err
+}
+
+func (krc *KuberayAPIServerClient) executeRequest(httpRequest *http.Request, URL string) ([]byte, *rpcStatus.Status, error) {
+	response, err := krc.httpClient.Do(httpRequest)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to execute http request for url '%s': %w", URL, err)
+	}
+	defer response.Body.Close()
+	bodyBytes, err := io.ReadAll(response.Body)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to read response body bytes: %w", err)
+	}
+	if response.StatusCode != http.StatusOK {
+		status, err := krc.extractStatus(bodyBytes)
+		if err != nil {
+			return nil, nil, err
+		}
+		return nil, status, &KuberayAPIServerClientError{
+			HTTPStatusCode: response.StatusCode,
+		}
+	}
+	return bodyBytes, nil, nil
+}
+
+func (krc *KuberayAPIServerClient) extractStatus(bodyBytes []byte) (*rpcStatus.Status, error) {
+	status := &rpcStatus.Status{}
+	err := krc.unmarshaler.Unmarshal(bodyBytes, status)
+	if err != nil {
+		return nil, fmt.Errorf("failed to unmarshal status object: %w", err)
+	}
+	return status, nil
+}
+
+func (krc *KuberayAPIServerClient) createHttpRequest(method string, endPoint string, body io.Reader) (*http.Request, error) {
+	req, err := http.NewRequest(method, endPoint, body)
+	if err != nil {
+		return nil, err
+	}
+	return req, nil
+}
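Putting the pieces of this new client together, a create/delete round trip could look like the sketch below. This is an editorial illustration, not part of the PR: the `api.Cluster` field names that are not visible in this diff (`Name`, `User`) are assumptions inferred from the validation messages, and the referenced compute template is presumed to already exist. The cluster spec mirrors the shapes used in the validation tests added by this PR.

```go
package main

import (
	"fmt"
	"net/http"

	kuberayHTTP "github.com/ray-project/kuberay/apiserver/pkg/http"
	api "github.com/ray-project/kuberay/proto/go_client"
)

func main() {
	client := kuberayHTTP.NewKuberayAPIServerClient("http://localhost:31888", &http.Client{})

	// A cluster spec shaped like the ones in this PR's validation tests.
	spec := &api.ClusterSpec{
		HeadGroupSpec: &api.HeadGroupSpec{
			ComputeTemplate: "default-template", // assumed to exist already
			RayStartParams:  map[string]string{"dashboard-host": "0.0.0.0"},
		},
		WorkerGroupSpec: []*api.WorkerGroupSpec{
			{GroupName: "small-wg", ComputeTemplate: "default-template", Replicas: 1, MinReplicas: 1, MaxReplicas: 2},
		},
	}

	cluster, status, err := client.CreateCluster(&api.CreateClusterRequest{
		Namespace: "ray-system",
		Cluster:   &api.Cluster{Name: "demo", Namespace: "ray-system", User: "dev", ClusterSpec: spec},
	})
	if err != nil {
		if status != nil {
			fmt.Printf("API server said: code=%d message=%s\n", status.Code, status.Message)
		}
		panic(err)
	}
	fmt.Printf("created cluster %q\n", cluster.Name)

	// Clean up.
	if _, err := client.DeleteCluster(&api.DeleteClusterRequest{Namespace: "ray-system", Name: "demo"}); err != nil {
		panic(err)
	}
}
```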
diff --git a/apiserver/pkg/server/cluster_server.go b/apiserver/pkg/server/cluster_server.go
index c3b0b8c1eb..abddb4cd12 100644
--- a/apiserver/pkg/server/cluster_server.go
+++ b/apiserver/pkg/server/cluster_server.go
@@ -153,23 +153,8 @@ func ValidateCreateClusterRequest(request *api.CreateClusterRequest) error {
 		return util.NewInvalidInputError("User who create the cluster is empty. Please specify a valid value.")
 	}

-	if len(request.Cluster.ClusterSpec.HeadGroupSpec.ComputeTemplate) == 0 {
-		return util.NewInvalidInputError("HeadGroupSpec compute template is empty. Please specify a valid value.")
-	}
-
-	for index, spec := range request.Cluster.ClusterSpec.WorkerGroupSpec {
-		if len(spec.GroupName) == 0 {
-			return util.NewInvalidInputError("WorkerNodeSpec %d group name is empty. Please specify a valid value.", index)
-		}
-		if len(spec.ComputeTemplate) == 0 {
-			return util.NewInvalidInputError("WorkerNodeSpec %d compute template is empty. Please specify a valid value.", index)
-		}
-		if spec.MaxReplicas == 0 {
-			return util.NewInvalidInputError("WorkerNodeSpec %d MaxReplicas can not be 0. Please specify a valid value.", index)
-		}
-		if spec.MinReplicas > spec.MaxReplicas {
-			return util.NewInvalidInputError("WorkerNodeSpec %d MinReplica > MaxReplicas. Please specify a valid value.", index)
-		}
+	if err := ValidateClusterSpec(request.Cluster.ClusterSpec); err != nil {
+		return err
 	}

 	return nil
diff --git a/apiserver/pkg/server/job_server.go b/apiserver/pkg/server/job_server.go
index 3b6ddb2ee0..2944872ef3 100644
--- a/apiserver/pkg/server/job_server.go
+++ b/apiserver/pkg/server/job_server.go
@@ -128,23 +128,8 @@ func ValidateCreateJobRequest(request *api.CreateRayJobRequest) error {
 		return nil
 	}

-	if len(request.Job.ClusterSpec.HeadGroupSpec.ComputeTemplate) == 0 {
-		return util.NewInvalidInputError("HeadGroupSpec compute template is empty. Please specify a valid value.")
-	}
-
-	for index, spec := range request.Job.ClusterSpec.WorkerGroupSpec {
-		if len(spec.GroupName) == 0 {
-			return util.NewInvalidInputError("WorkerNodeSpec %d group name is empty. Please specify a valid value.", index)
-		}
-		if len(spec.ComputeTemplate) == 0 {
-			return util.NewInvalidInputError("WorkerNodeSpec %d compute template is empty. Please specify a valid value.", index)
-		}
-		if spec.MaxReplicas == 0 {
-			return util.NewInvalidInputError("WorkerNodeSpec %d MaxReplicas can not be 0. Please specify a valid value.", index)
-		}
-		if spec.MinReplicas > spec.MaxReplicas {
-			return util.NewInvalidInputError("WorkerNodeSpec %d MinReplica > MaxReplicas. Please specify a valid value.", index)
-		}
+	if err := ValidateClusterSpec(request.Job.ClusterSpec); err != nil {
+		return err
 	}

 	return nil
diff --git a/apiserver/pkg/server/serve_server.go b/apiserver/pkg/server/serve_server.go
index c956dcf417..f347738d84 100644
--- a/apiserver/pkg/server/serve_server.go
+++ b/apiserver/pkg/server/serve_server.go
@@ -2,6 +2,7 @@ package server

 import (
 	"context"
+	"strings"

 	"github.com/ray-project/kuberay/apiserver/pkg/manager"
 	"github.com/ray-project/kuberay/apiserver/pkg/model"
@@ -155,6 +156,9 @@ func (s *RayServiceServer) DeleteRayService(ctx context.Context, request *api.De
 }

 func ValidateCreateServiceRequest(request *api.CreateRayServiceRequest) error {
+	if request == nil {
+		return util.NewInvalidInputError("A non nil request is expected")
+	}
 	if request.Namespace == "" {
 		return util.NewInvalidInputError("Namespace is empty.
Please specify a valid value.") } @@ -175,25 +179,23 @@ func ValidateCreateServiceRequest(request *api.CreateRayServiceRequest) error { return util.NewInvalidInputError("User who create the Service is empty. Please specify a valid value.") } - if len(request.Service.ClusterSpec.HeadGroupSpec.ComputeTemplate) == 0 { - return util.NewInvalidInputError("HeadGroupSpec compute template is empty. Please specify a valid value.") + if request.Service.ServeDeploymentGraphSpec == nil && strings.TrimSpace(request.Service.ServeConfig_V2) == "" { + return util.NewInvalidInputError("A serve config v2 or deployment graph specs is required. Please specify either.") } - for index, spec := range request.Service.ClusterSpec.WorkerGroupSpec { - if len(spec.GroupName) == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d group name is empty. Please specify a valid value.", index) - } - if len(spec.ComputeTemplate) == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d compute template is empty. Please specify a valid value.", index) - } - if spec.MaxReplicas == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d MaxReplicas can not be 0. Please specify a valid value.", index) - } - if spec.MinReplicas > spec.MaxReplicas { - return util.NewInvalidInputError("WorkerNodeSpec %d MinReplica > MaxReplicas. Please specify a valid value.", index) + if request.Service.ServeDeploymentGraphSpec != nil && strings.TrimSpace(request.Service.ServeConfig_V2) != "" { + return util.NewInvalidInputError("Both serve config v2 or deployment graph specs were specified. Please specify one or the other.") + } + if strings.TrimSpace(request.Service.ServeConfig_V2) == "" { + if err := ValidateServeDeploymentGraphSpec(request.Service.ServeDeploymentGraphSpec); err != nil { + return err } } + if err := ValidateClusterSpec(request.Service.ClusterSpec); err != nil { + return err + } + return nil } @@ -220,60 +222,56 @@ func ValidateUpdateServiceRequest(request *api.UpdateRayServiceRequest) error { if request.Service.User == "" { return util.NewInvalidInputError("User who create the Service is empty. Please specify a valid value.") } - - if len(request.Service.ClusterSpec.HeadGroupSpec.ComputeTemplate) == 0 { - return util.NewInvalidInputError("HeadGroupSpec compute template is empty. Please specify a valid value.") + if request.Service.ServeDeploymentGraphSpec == nil && strings.TrimSpace(request.Service.ServeConfig_V2) == "" { + return util.NewInvalidInputError("A serve config v2 or deployment graph specs is required. Please specify either.") } - for index, spec := range request.Service.ClusterSpec.WorkerGroupSpec { - if len(spec.GroupName) == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d group name is empty. Please specify a valid value.", index) - } - if len(spec.ComputeTemplate) == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d compute template is empty. Please specify a valid value.", index) - } - if spec.MaxReplicas == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d MaxReplicas can not be 0. Please specify a valid value.", index) - } - if spec.MinReplicas > spec.MaxReplicas { - return util.NewInvalidInputError("WorkerNodeSpec %d MinReplica > MaxReplicas. Please specify a valid value.", index) + if request.Service.ServeDeploymentGraphSpec != nil && strings.TrimSpace(request.Service.ServeConfig_V2) != "" { + return util.NewInvalidInputError("Both serve config v2 or deployment graph specs were specified. 
Please specify one or the other.") + } + if strings.TrimSpace(request.Service.ServeConfig_V2) == "" { + if err := ValidateServeDeploymentGraphSpec(request.Service.ServeDeploymentGraphSpec); err != nil { + return err } } + if err := ValidateClusterSpec(request.Service.ClusterSpec); err != nil { + return err + } + return nil } func ValidateUpdateRayServiceConfigsRequest(request *api.UpdateRayServiceConfigsRequest) error { + if request == nil { + return util.NewInvalidInputError("Update ray service config request can't be nil.") + } if request.Name == "" { - return util.NewInvalidInputError("ray service name is empty. Please specify a valid value.") + return util.NewInvalidInputError("Update ray service config request ray service name is empty. Please specify a valid value.") } if request.Namespace == "" { - return util.NewInvalidInputError("ray service namespace is empty. Please specify a valid value.") + return util.NewInvalidInputError("Update ray service config request ray service namespace is empty. Please specify a valid value.") } updateServiceBody := request.GetUpdateService() if updateServiceBody == nil || (updateServiceBody.WorkerGroupUpdateSpec == nil && updateServiceBody.ServeDeploymentGraphSpec == nil) { - return util.NewInvalidInputError("update spec is empty. Nothing to update.") + return util.NewInvalidInputError("Update ray service config request spec is empty. Nothing to update.") } if updateServiceBody.WorkerGroupUpdateSpec != nil { - for _, spec := range updateServiceBody.WorkerGroupUpdateSpec { + for index, spec := range updateServiceBody.WorkerGroupUpdateSpec { + if strings.TrimSpace(spec.GroupName) == "" { + return util.NewInvalidInputError("Update ray service config request worker group update spec at index %d is missing a name, Please specify a valid value.", index) + } if spec.Replicas <= 0 || spec.MinReplicas <= 0 || spec.MaxReplicas <= 0 { - return util.NewInvalidInputError("input invalid, replicas, minReplicas and maxReplicas must be greater than 0.") + return util.NewInvalidInputError("Update ray service config request worker group update spec at index %d has invalid values, replicas, minReplicas and maxReplicas must be greater than 0.", index) } if spec.MinReplicas > spec.MaxReplicas { - return util.NewInvalidInputError("WorkerNodeSpec %s MinReplica > MaxReplicas. Please specify a valid value.", spec.GroupName) + return util.NewInvalidInputError("Update ray service config request worker group update spec with name '%s' has MinReplica > MaxReplicas. 
Please specify a valid value.", spec.GroupName) } } } if updateServiceBody.ServeDeploymentGraphSpec != nil { - for _, spec := range updateServiceBody.ServeDeploymentGraphSpec.ServeConfigs { - if spec.Replicas <= 0 { - return util.NewInvalidInputError("input invalid, replicas must be greater than 0.") - } - if spec.ActorOptions != nil { - if spec.ActorOptions.CpusPerActor <= 0 && spec.ActorOptions.GpusPerActor <= 0 && spec.ActorOptions.MemoryPerActor <= 0 { - return util.NewInvalidInputError("input invalid, cpusPerActor, gpusPerActor and memoryPerActor must be greater than 0.") - } - } + if err := ValidateServeDeploymentGraphSpec(updateServiceBody.ServeDeploymentGraphSpec); err != nil { + return err } } return nil diff --git a/apiserver/pkg/server/validations.go b/apiserver/pkg/server/validations.go new file mode 100644 index 0000000000..35ffe3fe5d --- /dev/null +++ b/apiserver/pkg/server/validations.go @@ -0,0 +1,65 @@ +package server + +import ( + "strings" + + "github.com/ray-project/kuberay/apiserver/pkg/util" + api "github.com/ray-project/kuberay/proto/go_client" +) + +// ValidateClusterSpec validates that the *api.ClusterSpec is not nil and +// has all the required fields +func ValidateClusterSpec(clusterSpec *api.ClusterSpec) error { + if clusterSpec == nil { + return util.NewInvalidInputError("A ClusterSpec object is required. Please specify one.") + } + if clusterSpec.HeadGroupSpec == nil { + return util.NewInvalidInputError("Cluster Spec Object requires HeadGroupSpec to be populated. Please specify one.") + } + if len(clusterSpec.HeadGroupSpec.ComputeTemplate) == 0 { + return util.NewInvalidInputError("HeadGroupSpec compute template is empty. Please specify a valid value.") + } + if len(clusterSpec.HeadGroupSpec.RayStartParams) == 0 { + return util.NewInvalidInputError("HeadGroupSpec RayStartParams is empty. Please specify values.") + } + + for index, spec := range clusterSpec.WorkerGroupSpec { + if len(spec.GroupName) == 0 { + return util.NewInvalidInputError("WorkerNodeSpec %d group name is empty. Please specify a valid value.", index) + } + if len(spec.ComputeTemplate) == 0 { + return util.NewInvalidInputError("WorkerNodeSpec %d compute template is empty. Please specify a valid value.", index) + } + if spec.MaxReplicas == 0 { + return util.NewInvalidInputError("WorkerNodeSpec %d MaxReplicas can not be 0. Please specify a valid value.", index) + } + if spec.MinReplicas > spec.MaxReplicas { + return util.NewInvalidInputError("WorkerNodeSpec %d MinReplica > MaxReplicas. Please specify a valid value.", index) + } + } + return nil +} + +// ValidateServeDeploymentGraphSpec validates that the ServeDeploymentGraphSpec has the CRD required fields +func ValidateServeDeploymentGraphSpec(deploymentGraphSpec *api.ServeDeploymentGraphSpec) error { + if deploymentGraphSpec == nil { + return util.NewInvalidInputError("ServeDeploymentGraphSpec must be not nil. Please specify a valid object.") + } + if strings.TrimSpace(deploymentGraphSpec.ImportPath) == "" { + return util.NewInvalidInputError("ServeDeploymentGraphSpec import path must have a value. Please specify valid value.") + } + for index, serveConfig := range deploymentGraphSpec.ServeConfigs { + if strings.TrimSpace(serveConfig.DeploymentName) == "" { + return util.NewInvalidInputError("ServeConfig %d deployment name is empty. Please specify a valid value.", index) + } + if serveConfig.Replicas <= 0 { + return util.NewInvalidInputError("ServeConfig %d replicas must be greater than 0. 
Please specify a valid value.", index)
+		}
+		if serveConfig.ActorOptions != nil {
+			if serveConfig.ActorOptions.CpusPerActor <= 0 && serveConfig.ActorOptions.GpusPerActor <= 0 && serveConfig.ActorOptions.MemoryPerActor <= 0 {
+				return util.NewInvalidInputError("ServeConfig %d invalid ActorOptions, cpusPerActor, gpusPerActor and memoryPerActor must be greater than 0.", index)
+			}
+		}
+	}
+	return nil
+}
diff --git a/apiserver/pkg/server/validations_test.go b/apiserver/pkg/server/validations_test.go
new file mode 100644
index 0000000000..129387ecb5
--- /dev/null
+++ b/apiserver/pkg/server/validations_test.go
@@ -0,0 +1,721 @@
+package server_test
+
+import (
+	"testing"
+
+	"github.com/ray-project/kuberay/apiserver/pkg/server"
+	"github.com/ray-project/kuberay/apiserver/pkg/util"
+	api "github.com/ray-project/kuberay/proto/go_client"
+	"github.com/stretchr/testify/require"
+)
+
+func TestValidateClusterSpec(t *testing.T) {
+	tests := []struct {
+		name          string
+		clusterSpec   *api.ClusterSpec
+		expectedError error
+	}{
+		{
+			name: "A valid cluster spec",
+			clusterSpec: &api.ClusterSpec{
+				HeadGroupSpec: &api.HeadGroupSpec{
+					ComputeTemplate: "a template",
+					RayStartParams: map[string]string{
+						"dashboard-host":      "0.0.0.0",
+						"metrics-export-port": "8080",
+					},
+				},
+				WorkerGroupSpec: []*api.WorkerGroupSpec{
+					{
+						GroupName:       "group-1",
+						ComputeTemplate: "group-1-template",
+						Replicas:        1,
+						MinReplicas:     1,
+						MaxReplicas:     1,
+					},
+					{
+						GroupName:       "group-2",
+						ComputeTemplate: "group-2-template",
+						Replicas:        1,
+						MinReplicas:     1,
+						MaxReplicas:     1,
+					},
+				},
+			},
+			expectedError: nil,
+		},
+		{
+			name:          "A nil cluster spec",
+			clusterSpec:   nil,
+			expectedError: util.NewInvalidInputError("A ClusterSpec object is required. Please specify one."),
+		},
+		{
+			name:          "An empty cluster spec",
+			clusterSpec:   &api.ClusterSpec{},
+			expectedError: util.NewInvalidInputError("Cluster Spec Object requires HeadGroupSpec to be populated. Please specify one."),
+		},
+		{
+			name: "An empty head group cluster spec",
+			clusterSpec: &api.ClusterSpec{
+				HeadGroupSpec:   &api.HeadGroupSpec{},
+				WorkerGroupSpec: []*api.WorkerGroupSpec{},
+			},
+			expectedError: util.NewInvalidInputError("HeadGroupSpec compute template is empty. Please specify a valid value."),
+		},
+		{
+			name: "A head group without ray start parameters",
+			clusterSpec: &api.ClusterSpec{
+				HeadGroupSpec: &api.HeadGroupSpec{
+					ComputeTemplate: "a template",
+					RayStartParams:  nil,
+				},
+				WorkerGroupSpec: []*api.WorkerGroupSpec{},
+			},
+			expectedError: util.NewInvalidInputError("HeadGroupSpec RayStartParams is empty. Please specify values."),
+		},
+		{
+			name: "An empty worker group",
+			clusterSpec: &api.ClusterSpec{
+				HeadGroupSpec: &api.HeadGroupSpec{
+					ComputeTemplate: "a template",
+					RayStartParams: map[string]string{
+						"dashboard-host":      "0.0.0.0",
+						"metrics-export-port": "8080",
+					},
+				},
+				WorkerGroupSpec: []*api.WorkerGroupSpec{},
+			},
+			expectedError: nil,
+		},
+		{
+			name: "Two empty worker group specs",
+			clusterSpec: &api.ClusterSpec{
+				HeadGroupSpec: &api.HeadGroupSpec{
+					ComputeTemplate: "a template",
+					RayStartParams: map[string]string{
+						"dashboard-host":      "0.0.0.0",
+						"metrics-export-port": "8080",
+					},
+				},
+				WorkerGroupSpec: []*api.WorkerGroupSpec{
+					{},
+					{},
+				},
+			},
+			expectedError: util.NewInvalidInputError("WorkerNodeSpec 0 group name is empty.
Please specify a valid value."), + }, + { + name: "A worker group spec without a group name", + clusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a template", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "", + ComputeTemplate: "group-1-template", + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 1, + }, + }, + }, + expectedError: util.NewInvalidInputError("WorkerNodeSpec 0 group name is empty. Please specify a valid value."), + }, + { + name: "A worker group spec without a template", + clusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a template", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "group 1", + ComputeTemplate: "", + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 1, + }, + }, + }, + expectedError: util.NewInvalidInputError("WorkerNodeSpec 0 compute template is empty. Please specify a valid value."), + }, + { + name: "A worker group spec with 0 max replicas", + clusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a template", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "group 1", + ComputeTemplate: "a template", + MaxReplicas: 0, + }, + }, + }, + expectedError: util.NewInvalidInputError("WorkerNodeSpec 0 MaxReplicas can not be 0. Please specify a valid value."), + }, + { + name: "A worker group spec with invalid min replicas", + clusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a template", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "group 1", + ComputeTemplate: "a template", + MinReplicas: 5, + MaxReplicas: 1, + }, + }, + }, + expectedError: util.NewInvalidInputError("WorkerNodeSpec 0 MinReplica > MaxReplicas. 
Please specify a valid value."), + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.name, func(t *testing.T) { + actualError := server.ValidateClusterSpec(tc.clusterSpec) + if tc.expectedError == nil { + require.NoError(t, actualError, "No error expected.") + } else { + require.EqualError(t, actualError, tc.expectedError.Error(), "A matching error is expected") + } + }) + } +} + +func TestValidateCreateServiceRequest(t *testing.T) { + tests := []struct { + name string + request *api.CreateRayServiceRequest + expectedError error + }{ + { + name: "A valid create service request V2", + request: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeConfig_V2: "some yaml", + ServiceUnhealthySecondThreshold: 900, + DeploymentUnhealthySecondThreshold: 300, + ClusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a compute template name", + EnableIngress: false, + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + Volumes: []*api.Volume{}, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "group 1", + ComputeTemplate: "a-template", + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 1, + }, + }, + }, + }, + Namespace: "a-namespace", + }, + expectedError: nil, + }, + { + name: "A valid create service request V1", + request: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + RuntimeEnv: "working_dir: \"https://github.com/ray-project/test_dag/archive/c620251044717ace0a4c19d766d43c5099af8a77.zip\"\n", + ServeConfigs: []*api.ServeConfig{ + { + DeploymentName: "OrangeStand", + Replicas: 1, + UserConfig: "price: 2", + ActorOptions: &api.ActorOptions{ + CpusPerActor: 0.1, + }, + }, + }, + }, + ServiceUnhealthySecondThreshold: 900, + DeploymentUnhealthySecondThreshold: 300, + ClusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a compute template name", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + Volumes: []*api.Volume{}, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "group 1", + ComputeTemplate: "a-template", + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 1, + }, + }, + }, + }, + Namespace: "a-namespace", + }, + expectedError: nil, + }, + { + name: "A nil name create service request", + request: nil, + expectedError: util.NewInvalidInputError("A non nill request is expected"), + }, + { + name: "An empty create service request", + request: &api.CreateRayServiceRequest{}, + expectedError: util.NewInvalidInputError("Namespace is empty. 
Please specify a valid value."), + }, + { + name: "A create service request with a nill service spec", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: nil, + }, + expectedError: util.NewInvalidInputError("Service is empty, please input a valid payload."), + }, + { + name: "A create service request with mismatching namespaces", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: &api.RayService{ + Namespace: "another-namespace", + }, + }, + expectedError: util.NewInvalidInputError("The namespace in the request is different from the namespace in the service definition."), + }, + { + name: "A create service request with no name", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: &api.RayService{ + Namespace: "a-namespace", + }, + }, + expectedError: util.NewInvalidInputError("Service name is empty. Please specify a valid value."), + }, + { + name: "A create service request with no user name", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: &api.RayService{ + Namespace: "a-namespace", + Name: "fruit-stand", + User: "", + }, + }, + expectedError: util.NewInvalidInputError("User who create the Service is empty. Please specify a valid value."), + }, + { + name: "A create service with no service graph or V2 config", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: &api.RayService{ + Namespace: "a-namespace", + Name: "fruit-stand", + User: "3cp0", + }, + }, + expectedError: util.NewInvalidInputError("A serve config v2 or deployment graph specs is required. Please specify either."), + }, + { + name: "A create service request with both V1 and graph spec", + request: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeConfig_V2: "applications:\n - name: fruit_app\n import_path: fruit.deployment_graph\n route_prefix: /fruit\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: MangoStand\n num_replicas: 1\n user_config:\n price: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: OrangeStand\n num_replicas: 1\n user_config:\n price: 2\n ray_actor_options:\n num_cpus: 0.1\n - name: PearStand\n num_replicas: 1\n user_config:\n price: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: FruitMarket\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: DAGDriver\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: math_app\n import_path: conditional_dag.serve_dag\n route_prefix: /calc\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: Adder\n num_replicas: 1\n user_config:\n increment: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: Multiplier\n num_replicas: 1\n user_config:\n factor: 5\n ray_actor_options:\n num_cpus: 0.1\n - name: Router\n num_replicas: 1\n - name: create_order\n num_replicas: 1\n - name: DAGDriver\n num_replicas: 1\n", + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + RuntimeEnv: "working_dir: \"https://github.com/ray-project/test_dag/archive/c620251044717ace0a4c19d766d43c5099af8a77.zip\"\n", + ServeConfigs: []*api.ServeConfig{ + { + DeploymentName: "OrangeStand", + Replicas: 1, + UserConfig: "price: 2", + ActorOptions: &api.ActorOptions{ + CpusPerActor: 0.1, + }, + }, + }, + }, + 
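// Both ServeConfig_V2 and ServeDeploymentGraphSpec are deliberately populated above; the validator is expected to reject the combination. +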
ServiceUnhealthySecondThreshold: 900, + DeploymentUnhealthySecondThreshold: 300, + ClusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a compute template name", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + Volumes: []*api.Volume{}, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "group 1", + ComputeTemplate: "a-template", + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 1, + }, + }, + }, + }, + Namespace: "a-namespace", + }, + expectedError: util.NewInvalidInputError("Both serve config v2 or deployment graph specs were specified. Please specify one or the other."), + }, + { + name: "A create request with no cluster spec", + request: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeConfig_V2: "some yaml", + ServiceUnhealthySecondThreshold: 900, + DeploymentUnhealthySecondThreshold: 300, + ClusterSpec: nil, + }, + Namespace: "a-namespace", + }, + expectedError: util.NewInvalidInputError("A ClusterSpec object is required. Please specify one."), + }, + { + name: "A create request with empty deployment graph spec", + request: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{}, + ClusterSpec: nil, + }, + Namespace: "a-namespace", + }, + expectedError: util.NewInvalidInputError("ServeDeploymentGraphSpec import path must have a value. Please specify valid value."), + }, + { + name: "A create request with a invalid deployment graph spec empty serve config", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + ServeConfigs: []*api.ServeConfig{ + {}, + }, + }, + }, + }, + expectedError: util.NewInvalidInputError("ServeConfig 0 deployment name is empty. Please specify a valid value."), + }, + { + name: "A create request with a invalid deployment graph spec no replicas serve config", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + ServeConfigs: []*api.ServeConfig{ + { + DeploymentName: "OrangeStand", + Replicas: -1, + }, + }, + }, + }, + }, + expectedError: util.NewInvalidInputError("ServeConfig 0 replicas must be greater than 0. 
Please specify a valid value."), + }, + { + name: "A create request with a invalid deployment graph spec with invalid actor options", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + ServeConfigs: []*api.ServeConfig{ + { + DeploymentName: "OrangeStand", + Replicas: 1, + ActorOptions: &api.ActorOptions{ + CpusPerActor: -1, + GpusPerActor: -1, + MemoryPerActor: 0, + }, + }, + }, + }, + }, + }, + expectedError: util.NewInvalidInputError("ServeConfig 0 invalid ActorOptions, cpusPerActor, gpusPerActor and memoryPerActor must be greater than 0."), + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.name, func(t *testing.T) { + actualError := server.ValidateCreateServiceRequest(tc.request) + if tc.expectedError == nil { + require.NoError(t, actualError, "No error expected.") + } else { + require.EqualError(t, actualError, tc.expectedError.Error(), "A matching error is expected") + } + }) + } +} + +func TestValidateUpdateRayServiceConfigsRequest(t *testing.T) { + tests := []struct { + name string + request *api.UpdateRayServiceConfigsRequest + expectedError error + }{ + { + name: "A valid update request", + request: &api.UpdateRayServiceConfigsRequest{ + Name: "a-service-name", + Namespace: "a-namespace", + UpdateService: &api.UpdateRayServiceBody{ + WorkerGroupUpdateSpec: []*api.WorkerGroupUpdateSpec{ + { + GroupName: "a-group-name", + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 2, + }, + }, + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + RuntimeEnv: "working_dir: \"https://github.com/ray-project/test_dag/archive/c620251044717ace0a4c19d766d43c5099af8a77.zip\"\n", + ServeConfigs: []*api.ServeConfig{ + { + DeploymentName: "OrangeStand", + Replicas: 1, + UserConfig: "price: 2", + ActorOptions: &api.ActorOptions{ + CpusPerActor: 0.1, + }, + }, + }, + }, + }, + }, + expectedError: nil, + }, + { + name: "A valid update request with only workgroup update spec", + request: &api.UpdateRayServiceConfigsRequest{ + Name: "a-service-name", + Namespace: "a-namespace", + UpdateService: &api.UpdateRayServiceBody{ + WorkerGroupUpdateSpec: []*api.WorkerGroupUpdateSpec{ + { + GroupName: "a group name", + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 2, + }, + }, + ServeDeploymentGraphSpec: nil, + }, + }, + expectedError: nil, + }, + { + name: "A valid update request with only deployment graph spec", + request: &api.UpdateRayServiceConfigsRequest{ + Name: "a-service-name", + Namespace: "a-namespace", + UpdateService: &api.UpdateRayServiceBody{ + WorkerGroupUpdateSpec: nil, + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + RuntimeEnv: "working_dir: \"https://github.com/ray-project/test_dag/archive/c620251044717ace0a4c19d766d43c5099af8a77.zip\"\n", + ServeConfigs: []*api.ServeConfig{ + { + DeploymentName: "OrangeStand", + Replicas: 1, + UserConfig: "price: 2", + ActorOptions: &api.ActorOptions{ + CpusPerActor: 0.1, + }, + }, + }, + }, + }, + }, + expectedError: nil, + }, + { + name: "An empty request", + request: &api.UpdateRayServiceConfigsRequest{}, + expectedError: util.NewInvalidInputError("Update ray service config request ray service name is empty. 
Please specify a valid value."), + }, + { + name: "A nil request", + request: nil, + expectedError: util.NewInvalidInputError("Update ray service config request can't be nil."), + }, + { + name: "A no namespace request", + request: &api.UpdateRayServiceConfigsRequest{ + Name: "a-service-name", + Namespace: "", + UpdateService: &api.UpdateRayServiceBody{}, + }, + expectedError: util.NewInvalidInputError("Update ray service config request ray service namespace is empty. Please specify a valid value."), + }, + { + name: "A no service name request", + request: &api.UpdateRayServiceConfigsRequest{ + Name: "", + Namespace: "a-namespace", + UpdateService: &api.UpdateRayServiceBody{}, + }, + expectedError: util.NewInvalidInputError("Update ray service config request ray service name is empty. Please specify a valid value."), + }, + { + name: "A nil update ray service body", + request: &api.UpdateRayServiceConfigsRequest{ + Name: "a-service-name", + Namespace: "a-namespace", + UpdateService: nil, + }, + expectedError: util.NewInvalidInputError("Update ray service config request spec is empty. Nothing to update."), + }, + { + name: "An empty update ray service body", + request: &api.UpdateRayServiceConfigsRequest{ + Name: "a-service-name", + Namespace: "a-namespace", + UpdateService: &api.UpdateRayServiceBody{}, + }, + expectedError: util.NewInvalidInputError("Update ray service config request spec is empty. Nothing to update."), + }, + { + name: "A worker group spec with no name", + request: &api.UpdateRayServiceConfigsRequest{ + Name: "a-service-name", + Namespace: "a-namespace", + UpdateService: &api.UpdateRayServiceBody{ + WorkerGroupUpdateSpec: []*api.WorkerGroupUpdateSpec{ + { + GroupName: "", + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 2, + }, + }, + }, + }, + expectedError: util.NewInvalidInputError("Update ray service config request worker group update spec at index %d is missing a name, Please specify a valid value.", 0), + }, + { + name: "A worker group spec with invalid replica counts", + request: &api.UpdateRayServiceConfigsRequest{ + Name: "a-service-name", + Namespace: "a-namespace", + UpdateService: &api.UpdateRayServiceBody{ + WorkerGroupUpdateSpec: []*api.WorkerGroupUpdateSpec{ + { + GroupName: "small-wg", + Replicas: 0, + MinReplicas: 0, + MaxReplicas: 0, + }, + }, + }, + }, + expectedError: util.NewInvalidInputError("Update ray service config request worker group update spec at index %d has invalid values, replicas, minReplicas and maxReplicas must be greater than 0.", 0), + }, + { + name: "A worker group spec with invalid min max replica counts", + request: &api.UpdateRayServiceConfigsRequest{ + Name: "a-service-name", + Namespace: "a-namespace", + UpdateService: &api.UpdateRayServiceBody{ + WorkerGroupUpdateSpec: []*api.WorkerGroupUpdateSpec{ + { + GroupName: "small-wg", + Replicas: 1, + MinReplicas: 5, + MaxReplicas: 1, + }, + }, + }, + }, + expectedError: util.NewInvalidInputError("Update ray service config request worker group update spec with name 'small-wg' has MinReplica > MaxReplicas. Please specify a valid value."), + }, + { + name: "An empty ServeDeploymentGraphSpec", + request: &api.UpdateRayServiceConfigsRequest{ + Name: "a-service-name", + Namespace: "a-namespace", + UpdateService: &api.UpdateRayServiceBody{ + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{}, + }, + }, + expectedError: util.NewInvalidInputError("ServeDeploymentGraphSpec import path must have a value. 
Please specify valid value."), + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.name, func(t *testing.T) { + actualError := server.ValidateUpdateRayServiceConfigsRequest(tc.request) + if tc.expectedError == nil { + require.NoError(t, actualError, "No error expected.") + } else { + require.EqualError(t, actualError, tc.expectedError.Error(), "A matching error is expected") + } + }) + } +} diff --git a/apiserver/test/e2e/cluster_server_e2e_test.go b/apiserver/test/e2e/cluster_server_e2e_test.go new file mode 100644 index 0000000000..ee3c56fc7e --- /dev/null +++ b/apiserver/test/e2e/cluster_server_e2e_test.go @@ -0,0 +1,675 @@ +package e2e + +import ( + "net/http" + "testing" + "time" + + kuberayHTTP "github.com/ray-project/kuberay/apiserver/pkg/http" + api "github.com/ray-project/kuberay/proto/go_client" + + rayv1api "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "k8s.io/apimachinery/pkg/util/wait" +) + +// TestCreateClusterEndpoint sequentially iterates over the create cluster endpoint +// with valid and invalid requests +func TestCreateClusterEndpoint(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + // create config map and register a cleanup hook upon success + configMapName := tCtx.CreateConfigMap(t, map[string]string{ + "counter_sample.py": ReadFileAsString(t, "resources/counter_sample.py"), + }) + t.Cleanup(func() { + tCtx.DeleteConfigMap(t, configMapName) + }) + + tests := []GenericEnd2EndTest[*api.CreateClusterRequest]{ + { + Name: "Create a cluster without volumes", + Input: &api.CreateClusterRequest{ + Cluster: &api.Cluster{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "3cpo", + Version: "2.7.0", + Environment: api.Cluster_DEV, + ClusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + ServiceType: "NodePort", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "small-wg", + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 5, + RayStartParams: map[string]string{ + "node-ip-address": "$MY_POD_IP", + }, + }, + }, + }, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: nil, + }, + { + Name: "Create cluster with config map volume", + Input: &api.CreateClusterRequest{ + Cluster: &api.Cluster{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "boris", + Version: "2.7.0", + Environment: api.Cluster_DEV, + ClusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + ServiceType: "NodePort", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + Volumes: []*api.Volume{ + { + MountPath: "/home/ray/samples", + VolumeType: api.Volume_CONFIGMAP, + Name: "code-sample", + Source: tCtx.GetConfigMapName(), + Items: map[string]string{ + "counter_sample.py": "counter_sample.py", + }, + }, + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: 
"small-wg", + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 5, + RayStartParams: map[string]string{ + "node-ip-address": "$MY_POD_IP", + }, + Volumes: []*api.Volume{ + { + MountPath: "/home/ray/samples", + VolumeType: api.Volume_CONFIGMAP, + Name: "code-sample", + Source: tCtx.GetConfigMapName(), + Items: map[string]string{ + "counter_sample.py": "counter_sample.py", + }, + }, + }, + }, + }, + }, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: nil, + }, + { + Name: "Create cluster with no workers", + Input: &api.CreateClusterRequest{ + Cluster: &api.Cluster{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "boris", + Version: "2.7.0", + Environment: api.Cluster_DEV, + ClusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + ServiceType: "NodePort", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + Volumes: []*api.Volume{ + { + MountPath: "/home/ray/samples", + VolumeType: api.Volume_CONFIGMAP, + Name: "code-sample", + Source: tCtx.GetConfigMapName(), + Items: map[string]string{ + "counter_sample.py": "counter_sample.py", + }, + }, + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{}, + }, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: nil, + }, + { + Name: "Create cluster with no namespace in request", + Input: &api.CreateClusterRequest{ + Cluster: &api.Cluster{}, + Namespace: "", + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusNotFound, + }, + }, + { + Name: "Create cluster with no cluster object", + Input: &api.CreateClusterRequest{ + Cluster: nil, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create cluster with no namespace in the cluster object", + Input: &api.CreateClusterRequest{ + Cluster: &api.Cluster{ + Name: tCtx.GetNextName(), + Namespace: "", + User: "", + Version: "", + Environment: api.Cluster_DEV, + ClusterSpec: &api.ClusterSpec{}, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create cluster with no name in the cluster object", + Input: &api.CreateClusterRequest{ + Cluster: &api.Cluster{ + Name: "", + Namespace: tCtx.GetNamespaceName(), + User: "", + Version: "", + Environment: api.Cluster_DEV, + ClusterSpec: &api.ClusterSpec{}, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create cluster with no user name in the cluster object", + Input: &api.CreateClusterRequest{ + Cluster: &api.Cluster{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "", + Version: "", + Environment: api.Cluster_DEV, + ClusterSpec: &api.ClusterSpec{}, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create cluster with nil cluster spec in the cluster object", + Input: &api.CreateClusterRequest{ + Cluster: &api.Cluster{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "bullwinkle", + Version: "2.7.0", + Environment: api.Cluster_DEV, + 
ClusterSpec: nil, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create cluster with no head group spec in the cluster object", + Input: &api.CreateClusterRequest{ + Cluster: &api.Cluster{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "bullwinkle", + Version: "2.7.0", + Environment: api.Cluster_DEV, + ClusterSpec: &api.ClusterSpec{}, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create cluster with no compute template in the head group spec", + Input: &api.CreateClusterRequest{ + Cluster: &api.Cluster{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "boris", + Version: "2.7.0", + Environment: api.Cluster_DEV, + ClusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "", + Image: tCtx.GetRayImage(), + ServiceType: "NodePort", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{}, + }, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create cluster with no ray start parameters in the head group spec", + Input: &api.CreateClusterRequest{ + Cluster: &api.Cluster{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "boris", + Version: "2.7.0", + Environment: api.Cluster_DEV, + ClusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + ServiceType: "NodePort", + RayStartParams: map[string]string{}, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{}, + }, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create cluster with no group name in the worker group spec", + Input: &api.CreateClusterRequest{ + Cluster: &api.Cluster{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "boris", + Version: "2.7.0", + Environment: api.Cluster_DEV, + ClusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + ServiceType: "NodePort", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + {}, + }, + }, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create cluster with no compute template in the worker group spec", + Input: &api.CreateClusterRequest{ + Cluster: &api.Cluster{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "boris", + Version: "2.7.0", + Environment: api.Cluster_DEV, + ClusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + ServiceType: "NodePort", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "small-wg", + ComputeTemplate: "", + Image: "", + }, + }, + }, 
+ }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create cluster with invalid replica count in the worker group spec", + Input: &api.CreateClusterRequest{ + Cluster: &api.Cluster{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "boris", + Version: "2.7.0", + Environment: api.Cluster_DEV, + ClusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + ServiceType: "NodePort", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "small-wg", + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + }, + }, + }, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.Name, func(t *testing.T) { + actualCluster, actualRpcStatus, err := tCtx.GetRayApiServerClient().CreateCluster(tc.Input) + if tc.ExpectedError == nil { + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, actualCluster, "A cluster is expected") + waitForRunningCluster(t, tCtx, actualCluster.Name) + tCtx.DeleteRayCluster(t, actualCluster.Name) + } else { + require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected") + require.NotNil(t, actualRpcStatus, "A not nill RPC status is required") + } + }) + } +} + +func TestDeleteCluster(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + tCtx.CreateRayClusterWithConfigMaps(t, map[string]string{ + "counter_sample.py": ReadFileAsString(t, "resources/counter_sample.py"), + "fail_fast.py": ReadFileAsString(t, "resources/fail_fast_sample.py"), + }) + tests := []GenericEnd2EndTest[*api.DeleteClusterRequest]{ + { + Name: "Delete an existing cluster", + Input: &api.DeleteClusterRequest{ + Name: tCtx.GetRayClusterName(), + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: nil, + }, + { + Name: "Delete a non existing cluster", + Input: &api.DeleteClusterRequest{ + Name: "bogus-cluster-name", + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusNotFound, + }, + }, + { + Name: "Delete a cluster with no namespace", + Input: &api.DeleteClusterRequest{ + Name: "bogus-cluster-name", + Namespace: "", + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusNotFound, + }, + }, + { + Name: "Delete a cluster with no name", + Input: &api.DeleteClusterRequest{ + Name: "", + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.Name, func(t *testing.T) { + actualRpcStatus, err := tCtx.GetRayApiServerClient().DeleteCluster(tc.Input) + if tc.ExpectedError == nil { + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, 
"No RPC status expected") + waitForDeletedCluster(t, tCtx, tc.Input.Name) + } else { + require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected") + require.NotNil(t, actualRpcStatus, "A not nill RPC status is required") + } + }) + } +} + +// TestGetAllClusters tests gets all Ray clusters from k8s cluster +func TestGetAllClusters(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + actualCluster, confiMapName := tCtx.CreateRayClusterWithConfigMaps(t, map[string]string{ + "counter_sample.py": ReadFileAsString(t, "resources/counter_sample.py"), + "fail_fast.py": ReadFileAsString(t, "resources/fail_fast_sample.py"), + }) + t.Cleanup(func() { + tCtx.DeleteRayCluster(t, actualCluster.Name) + tCtx.DeleteConfigMap(t, confiMapName) + }) + + response, actualRpcStatus, err := tCtx.GetRayApiServerClient().ListAllClusters() + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, response, "A response is expected") + require.NotEmpty(t, response.Clusters, "A list of clusters is required") + require.Equal(t, tCtx.GetRayClusterName(), response.Clusters[0].Name) + require.Equal(t, tCtx.GetNamespaceName(), response.Clusters[0].Namespace) +} + +// TestGetClustersInNamespace validates t +func TestGetClustersInNamespace(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + cluster, configMapName := tCtx.CreateRayClusterWithConfigMaps(t, map[string]string{ + "counter_sample.py": ReadFileAsString(t, "resources/counter_sample.py"), + "fail_fast.py": ReadFileAsString(t, "resources/fail_fast_sample.py"), + }) + t.Cleanup(func() { + tCtx.DeleteRayCluster(t, cluster.Name) + tCtx.DeleteConfigMap(t, configMapName) + }) + + response, actualRpcStatus, err := tCtx.GetRayApiServerClient().ListClusters( + &api.ListClustersRequest{ + Namespace: tCtx.GetNamespaceName(), + }) + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, response, "A response is expected") + require.NotEmpty(t, response.Clusters, "A list of compute templates is required") + require.Equal(t, tCtx.GetRayClusterName(), response.Clusters[0].Name) + require.Equal(t, tCtx.GetNamespaceName(), response.Clusters[0].Namespace) +} + +// TestDeleteTemplate sequentially iterates over the delete compute template endpoint +// to validate various input scenarios +func TestGetClustersByNameInNamespace(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + cluster, configMapName := tCtx.CreateRayClusterWithConfigMaps(t, map[string]string{ + "counter_sample.py": ReadFileAsString(t, "resources/counter_sample.py"), + "fail_fast.py": ReadFileAsString(t, "resources/fail_fast_sample.py"), + }) + t.Cleanup(func() { + tCtx.DeleteRayCluster(t, cluster.Name) + tCtx.DeleteConfigMap(t, configMapName) + }) + + tests := []GenericEnd2EndTest[*api.GetClusterRequest]{ + { + Name: "Get cluster by name in a namespace", + Input: &api.GetClusterRequest{ + Name: tCtx.GetRayClusterName(), + Namespace: 
tCtx.GetNamespaceName(), + }, + ExpectedError: nil, + }, + { + Name: "Get non existing cluster", + Input: &api.GetClusterRequest{ + Name: "a-bogus-cluster-name", + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusNotFound, + }, + }, + { + Name: "Get a cluster with no Name", + Input: &api.GetClusterRequest{ + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Get a cluster with no namespace", + Input: &api.GetClusterRequest{ + Name: "some-Name", + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusNotFound, + }, + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.Name, func(t *testing.T) { + actualCluster, actualRpcStatus, err := tCtx.GetRayApiServerClient().GetCluster(tc.Input) + if tc.ExpectedError == nil { + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.Equal(t, tCtx.GetRayClusterName(), actualCluster.Name) + require.Equal(t, tCtx.GetNamespaceName(), actualCluster.Namespace) + } else { + require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected") + require.NotNil(t, actualRpcStatus, "A not nill RPC status is required") + } + }) + } +} + +func waitForRunningCluster(t *testing.T, tCtx *End2EndTestingContext, clusterName string) { + // wait for the cluster to be in a running state for 3 minutes + // if is not in that state, return an error + err := wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) { + rayCluster, err00 := tCtx.GetRayClusterByName(clusterName) + if err00 != nil { + return true, err00 + } + t.Logf("Found cluster state of '%s' for ray cluster '%s'", rayCluster.Status.State, clusterName) + return rayCluster.Status.State == rayv1api.Ready, nil + }) + require.NoErrorf(t, err, "No error expected when getting ray cluster: '%s', err %v", tCtx.GetRayClusterName(), err) +} + +func waitForDeletedCluster(t *testing.T, tCtx *End2EndTestingContext, clusterName string) { + // wait for the cluster to be deleted + // if is not in that state, return an error + err := wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) { + rayCluster, err00 := tCtx.GetRayClusterByName(clusterName) + if err00 != nil && + assert.EqualError(t, err00, "rayclusters.ray.io \""+tCtx.GetRayClusterName()+"\" not found") { + return true, nil + } + t.Logf("Found status of '%s' for ray cluster '%s'", rayCluster.Status.State, clusterName) + return false, err00 + }) + require.NoErrorf(t, err, "No error expected when deleting ray cluster: '%s', err %v", clusterName, err) +} diff --git a/apiserver/test/e2e/config_server_e2e_test.go b/apiserver/test/e2e/config_server_e2e_test.go new file mode 100644 index 0000000000..176faeb3dd --- /dev/null +++ b/apiserver/test/e2e/config_server_e2e_test.go @@ -0,0 +1,299 @@ +package e2e + +import ( + "net/http" + "reflect" + "testing" + + kuberayHTTP "github.com/ray-project/kuberay/apiserver/pkg/http" + api "github.com/ray-project/kuberay/proto/go_client" + "github.com/stretchr/testify/require" +) + +// TestCreateTemplate sequentially iterates over the create compute endpoint +// to validate various input scenarios +func TestCreateTemplate(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when 
creating testing context") + + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + + tests := []GenericEnd2EndTest[*api.CreateComputeTemplateRequest]{ + { + Name: "Create a valid compute template", + Input: &api.CreateComputeTemplateRequest{ + ComputeTemplate: &api.ComputeTemplate{ + Name: tCtx.GetComputeTemplateName(), + Namespace: tCtx.GetNamespaceName(), + Cpu: 2, + Memory: 4, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: nil, + }, + { + Name: "Create an invalid template with no Name", + Input: &api.CreateComputeTemplateRequest{ + ComputeTemplate: &api.ComputeTemplate{ + Name: "", + Namespace: tCtx.GetNamespaceName(), + Cpu: 2, + Memory: 4, + Gpu: 0, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create an invalid template with different namespace", + Input: &api.CreateComputeTemplateRequest{ + ComputeTemplate: &api.ComputeTemplate{ + Name: tCtx.GetComputeTemplateName(), + Namespace: "another", + Cpu: 2, + Memory: 4, + Gpu: 0, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create an invalid template with zero cpu", + Input: &api.CreateComputeTemplateRequest{ + ComputeTemplate: &api.ComputeTemplate{ + Name: tCtx.GetComputeTemplateName(), + Namespace: tCtx.GetNamespaceName(), + Cpu: 0, + Memory: 4, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create an invalid template with zero memory", + Input: &api.CreateComputeTemplateRequest{ + ComputeTemplate: &api.ComputeTemplate{ + Name: tCtx.GetComputeTemplateName(), + Namespace: tCtx.GetNamespaceName(), + Cpu: 2, + Memory: 0, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create a duplicate invalid", + Input: &api.CreateComputeTemplateRequest{ + ComputeTemplate: &api.ComputeTemplate{ + Name: tCtx.GetComputeTemplateName(), + Namespace: tCtx.GetNamespaceName(), + Cpu: 2, + Memory: 0, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.Name, func(t *testing.T) { + actualTemplate, actualRpcStatus, err := tCtx.GetRayApiServerClient().CreateComputeTemplate(tc.Input) + if tc.ExpectedError != nil { + require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected") + require.NotNil(t, actualRpcStatus, "A not nill RPC status is required") + } else { + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.Truef(t, reflect.DeepEqual(tc.Input.ComputeTemplate, actualTemplate), "Equal templates expected") + } + }) + } +} + +// TestDeleteTemplate sequentially iterates over the delete compute template endpoint +// to validate various input scenarios +func TestDeleteTemplate(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + + tests := []GenericEnd2EndTest[*api.DeleteComputeTemplateRequest]{ + { + Name: "Delete existing template", + Input: 
&api.DeleteComputeTemplateRequest{
+				Name:      tCtx.GetComputeTemplateName(),
+				Namespace: tCtx.GetNamespaceName(),
+			},
+			ExpectedError: nil,
+		},
+		{
+			Name: "Delete a non existing template",
+			Input: &api.DeleteComputeTemplateRequest{
+				Name:      "another-template",
+				Namespace: tCtx.GetNamespaceName(),
+			},
+			ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{
+				HTTPStatusCode: http.StatusNotFound,
+			},
+		},
+		{
+			Name: "Delete a template with no name",
+			Input: &api.DeleteComputeTemplateRequest{
+				Namespace: tCtx.GetNamespaceName(),
+			},
+			ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{
+				HTTPStatusCode: http.StatusBadRequest,
+			},
+		},
+		{
+			Name: "Delete a template with no namespace",
+			Input: &api.DeleteComputeTemplateRequest{
+				Name: "some-name",
+			},
+			ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{
+				HTTPStatusCode: http.StatusNotFound,
+			},
+		},
+	}
+	// Execute tests sequentially
+	for _, tc := range tests {
+		tc := tc // capture range variable
+		t.Run(tc.Name, func(t *testing.T) {
+			actualRpcStatus, err := tCtx.GetRayApiServerClient().DeleteComputeTemplate(tc.Input)
+			if tc.ExpectedError != nil {
+				require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected")
+				require.NotNil(t, actualRpcStatus, "A non-nil RPC status is required")
+			} else {
+				require.NoError(t, err, "No error expected")
+				require.Nil(t, actualRpcStatus, "No RPC status expected")
+			}
+		})
+	}
+}
+
+// TestGetAllComputeTemplates tests the get all compute templates endpoint
+// to validate various input scenarios
+func TestGetAllComputeTemplates(t *testing.T) {
+	tCtx, err := NewEnd2EndTestingContext(t)
+	require.NoError(t, err, "No error expected when creating testing context")
+
+	tCtx.CreateComputeTemplate(t)
+	t.Cleanup(func() {
+		tCtx.DeleteComputeTemplate(t)
+	})
+
+	response, actualRpcStatus, err := tCtx.GetRayApiServerClient().GetAllComputeTemplates()
+	require.NoError(t, err, "No error expected")
+	require.Nil(t, actualRpcStatus, "No RPC status expected")
+	require.NotNil(t, response, "A response is expected")
+	require.NotEmpty(t, response.ComputeTemplates, "A list of compute templates is required")
+	require.Equal(t, tCtx.GetComputeTemplateName(), response.ComputeTemplates[0].Name)
+	require.Equal(t, tCtx.GetNamespaceName(), response.ComputeTemplates[0].Namespace)
+}
+
+// TestGetTemplatesInNamespace tests the get all compute templates in a namespace endpoint
+// to validate various input scenarios
+func TestGetTemplatesInNamespace(t *testing.T) {
+	tCtx, err := NewEnd2EndTestingContext(t)
+	require.NoError(t, err, "No error expected when creating testing context")
+
+	tCtx.CreateComputeTemplate(t)
+	t.Cleanup(func() {
+		tCtx.DeleteComputeTemplate(t)
+	})
+
+	response, actualRpcStatus, err := tCtx.GetRayApiServerClient().GetAllComputeTemplatesInNamespace(
+		&api.ListComputeTemplatesRequest{
+			Namespace: tCtx.GetNamespaceName(),
+		})
+	require.NoError(t, err, "No error expected")
+	require.Nil(t, actualRpcStatus, "No RPC status expected")
+	require.NotNil(t, response, "A response is expected")
+	require.NotEmpty(t, response.ComputeTemplates, "A list of compute templates is required")
+	require.Equal(t, tCtx.GetComputeTemplateName(), response.ComputeTemplates[0].Name)
+	require.Equal(t, tCtx.GetNamespaceName(), response.ComputeTemplates[0].Namespace)
+}
+
+// TestGetTemplateByNameInNamespace sequentially iterates over the get compute template endpoint
+// to validate various input scenarios
+func TestGetTemplateByNameInNamespace(t *testing.T) {
+	tCtx, err := NewEnd2EndTestingContext(t)
+	require.NoError(t, err, "No error expected when creating testing context")
+
+	tCtx.CreateComputeTemplate(t)
+
+	tests := []GenericEnd2EndTest[*api.GetComputeTemplateRequest]{
+		{
+			Name: "Get template by name in a namespace",
+			Input: &api.GetComputeTemplateRequest{
+				Name:      tCtx.GetComputeTemplateName(),
+				Namespace: tCtx.GetNamespaceName(),
+			},
+			ExpectedError: nil,
+		},
+		{
+			Name: "Get non existing template",
+			Input: &api.GetComputeTemplateRequest{
+				Name:      "another-template",
+				Namespace: tCtx.GetNamespaceName(),
+			},
+			ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{
+				HTTPStatusCode: http.StatusNotFound,
+			},
+		},
+		{
+			Name: "Get a template with no name",
+			Input: &api.GetComputeTemplateRequest{
+				Namespace: tCtx.GetNamespaceName(),
+			},
+			ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{
+				HTTPStatusCode: http.StatusBadRequest,
+			},
+		},
+		{
+			Name: "Get a template with no namespace",
+			Input: &api.GetComputeTemplateRequest{
+				Name: "some-Name",
+			},
+			ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{
+				HTTPStatusCode: http.StatusNotFound,
+			},
+		},
+	}
+	// Execute tests sequentially
+	for _, tc := range tests {
+		tc := tc // capture range variable
+		t.Run(tc.Name, func(t *testing.T) {
+			actualTemplate, actualRpcStatus, err := tCtx.GetRayApiServerClient().GetComputeTemplate(tc.Input)
+			if tc.ExpectedError != nil {
+				require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected")
+				require.NotNil(t, actualRpcStatus, "A non-nil RPC status is required")
+			} else {
+				require.NoError(t, err, "No error expected")
+				require.Nil(t, actualRpcStatus, "No RPC status expected")
+				require.Equal(t, tCtx.GetComputeTemplateName(), actualTemplate.Name)
+				require.Equal(t, tCtx.GetNamespaceName(), actualTemplate.Namespace)
+			}
+		})
+	}
+}
diff --git a/apiserver/test/e2e/doc.go b/apiserver/test/e2e/doc.go
new file mode 100644
index 0000000000..e065bde842
--- /dev/null
+++ b/apiserver/test/e2e/doc.go
@@ -0,0 +1,15 @@
+// Package e2e provides test functions, utility functions and structs that allow for integration testing
+// of the Kuberay API server and the Kuberay operator.
+//
+// The code assumes that the cluster found in [~/.kube/config] is up and has the needed components (Kuberay API server,
+// Kuberay Operator) deployed and functional.
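+//
+// Most of the tests in this package are table driven; each table entry is a
+// GenericEnd2EndTest, one of the data types provided by types.go (which this
+// diff does not show). A plausible sketch of that helper, inferred purely from
+// how the tests use it:
+//
+//	type GenericEnd2EndTest[I any] struct {
+//		Name          string // subtest name passed to t.Run
+//		Input         I      // request payload sent to the API server client
+//		ExpectedError error  // nil means the call is expected to succeed
+//	}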
+//
+// The code is organized as follows:
+//
+//   - types.go -- provides the data types
+//   - utils.go -- provides the utility functions
+//   - cluster_server_e2e_test.go -- provides the test functions for the Cluster GRPC Server
+//   - config_server_e2e_test.go -- provides the test functions for the Config GRPC Server
+//   - job_server_e2e_test.go -- provides the test functions for the Job GRPC Server
+//   - service_server_e2e_test.go -- provides the test functions for the Service GRPC Server
+package e2e
diff --git a/apiserver/test/e2e/job_server_e2e_test.go b/apiserver/test/e2e/job_server_e2e_test.go
new file mode 100644
index 0000000000..aa44b39e99
--- /dev/null
+++ b/apiserver/test/e2e/job_server_e2e_test.go
@@ -0,0 +1,581 @@
+package e2e
+
+import (
+	"net/http"
+	"testing"
+	"time"
+
+	rayv1api "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"k8s.io/apimachinery/pkg/util/wait"
+
+	kuberayHTTP "github.com/ray-project/kuberay/apiserver/pkg/http"
+	api "github.com/ray-project/kuberay/proto/go_client"
+)
+
+func TestCreateJobWithDisposableClusters(t *testing.T) {
+	tCtx, err := NewEnd2EndTestingContext(t)
+	require.NoError(t, err, "No error expected when creating testing context")
+
+	tCtx.CreateComputeTemplate(t)
+	t.Cleanup(func() {
+		tCtx.DeleteComputeTemplate(t)
+	})
+	// create config map and register a cleanup hook upon success
+	configMapName := tCtx.CreateConfigMap(t, map[string]string{
+		"counter_sample.py": ReadFileAsString(t, "resources/counter_sample.py"),
+		"fail_fast.py":      ReadFileAsString(t, "resources/fail_fast_sample.py"),
+	})
+	t.Cleanup(func() {
+		tCtx.DeleteConfigMap(t, configMapName)
+	})
+
+	items := map[string]string{
+		"counter_sample.py": "counter_sample.py",
+		"fail_fast.py":      "fail_fast_sample.py",
+	}
+
+	startParams := map[string]string{
+		"dashboard-host":      "0.0.0.0",
+		"metrics-export-port": "8080",
+	}
+	volume := &api.Volume{
+		MountPath:  "/home/ray/samples",
+		VolumeType: api.Volume_CONFIGMAP,
+		Name:       "code-sample",
+		Source:     tCtx.GetConfigMapName(),
+		Items:      items,
+	}
+
+	clusterSpec := &api.ClusterSpec{
+		HeadGroupSpec: &api.HeadGroupSpec{
+			ComputeTemplate: tCtx.GetComputeTemplateName(),
+			Image:           tCtx.GetRayImage(),
+			ServiceType:     "NodePort",
+			EnableIngress:   false,
+			RayStartParams:  startParams,
+			Volumes:         []*api.Volume{volume},
+		},
+		WorkerGroupSpec: []*api.WorkerGroupSpec{
+			{
+				GroupName:       "small-wg",
+				ComputeTemplate: tCtx.GetComputeTemplateName(),
+				Image:           tCtx.GetRayImage(),
+				Replicas:        1,
+				MinReplicas:     1,
+				MaxReplicas:     5,
+				RayStartParams:  startParams,
+				Volumes:         []*api.Volume{volume},
+			},
+		},
+	}
+
+	tests := []struct {
+		Name              string
+		Input             *api.CreateRayJobRequest
+		ExpectedError     error
+		ExpectedJobStatus rayv1api.JobStatus
+	}{
+		{
+			Name: "Create a running sample job",
+			Input: &api.CreateRayJobRequest{
+				Job: &api.RayJob{
+					Name:                     tCtx.GetNextName(),
+					Namespace:                tCtx.GetNamespaceName(),
+					User:                     "natacha",
+					Entrypoint:               "python /home/ray/samples/counter_sample.py",
+					RuntimeEnv:               "pip:\n - requests==2.26.0\n - pendulum==2.1.2\nenv_vars:\n counter_name: test_counter\n",
+					ShutdownAfterJobFinishes: true,
+					ClusterSpec:              clusterSpec,
+					TtlSecondsAfterFinished:  60,
+				},
+				Namespace: tCtx.GetNamespaceName(),
+			},
+			ExpectedError:     nil,
+			ExpectedJobStatus: rayv1api.JobStatusSucceeded,
+		},
+		{
+			Name: "Create a failing sample job",
+			Input: &api.CreateRayJobRequest{
+				Job: &api.RayJob{
+					Name:      tCtx.GetNextName(),
+					Namespace: tCtx.GetNamespaceName(),
+					User:      "natacha",
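+					// fail_fast.py (mounted from the config map above) is expected to exit with an error, driving the job to JobStatusFailed.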
Entrypoint: "python /home/ray/samples/fail_fast.py", + ShutdownAfterJobFinishes: true, + ClusterSpec: clusterSpec, + TtlSecondsAfterFinished: 60, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: nil, + ExpectedJobStatus: rayv1api.JobStatusFailed, + }, + { + Name: "Create a job request without providing a namespace", + Input: &api.CreateRayJobRequest{ + Job: nil, + Namespace: "", + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusNotFound, + }, + }, + { + Name: "Create a job request with nil job spec", + Input: &api.CreateRayJobRequest{ + Job: nil, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create a job request with no namespace in the job spec", + Input: &api.CreateRayJobRequest{ + Job: &api.RayJob{ + Name: tCtx.GetNextName(), + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create a job request with no name", + Input: &api.CreateRayJobRequest{ + Job: &api.RayJob{ + Namespace: tCtx.GetNamespaceName(), + Name: "", + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create a job request with no user name", + Input: &api.CreateRayJobRequest{ + Job: &api.RayJob{ + Namespace: tCtx.GetNamespaceName(), + Name: tCtx.GetNextName(), + User: "", + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create a job request with entrypoint", + Input: &api.CreateRayJobRequest{ + Job: &api.RayJob{ + Namespace: tCtx.GetNamespaceName(), + Name: tCtx.GetNextName(), + User: "bullwinkle", + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create a job request with nil cluster spec", + Input: &api.CreateRayJobRequest{ + Job: &api.RayJob{ + Namespace: tCtx.GetNamespaceName(), + Name: tCtx.GetNextName(), + User: "bullwinkle", + Entrypoint: "python /home/ray/samples/counter_sample.py", + RuntimeEnv: "pip:\n - requests==2.26.0\n - pendulum==2.1.2\nenv_vars:\n counter_name: test_counter\n", + ShutdownAfterJobFinishes: true, + ClusterSpec: nil, + TtlSecondsAfterFinished: 60, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.Name, func(t *testing.T) { + actualJob, actualRpcStatus, err := tCtx.GetRayApiServerClient().CreateRayJob(tc.Input) + if tc.ExpectedError == nil { + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, actualJob, "A job is expected") + waitForRayJob(t, tCtx, tc.Input.Job.Name, tc.ExpectedJobStatus) + tCtx.DeleteRayJobByName(t, actualJob.Name) + } else { + require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected") + require.NotNil(t, actualRpcStatus, "A not nill RPC status is required") + } + }) + } +} + +func TestDeleteJob(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating 
testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + testJobRequest := createTestJob(t, tCtx) + + tests := []GenericEnd2EndTest[*api.DeleteRayJobRequest]{ + { + Name: "Delete an existing job", + Input: &api.DeleteRayJobRequest{ + Name: testJobRequest.Job.Name, + Namespace: testJobRequest.Namespace, + }, + ExpectedError: nil, + }, + { + Name: "Delete a non existing job", + Input: &api.DeleteRayJobRequest{ + Name: "a-bogus-job-name", + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusNotFound, + }, + }, + { + Name: "Delete a job without providing a namespace", + Input: &api.DeleteRayJobRequest{ + Name: testJobRequest.Job.Name, + Namespace: "", + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusNotFound, + }, + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.Name, func(t *testing.T) { + actualRpcStatus, err := tCtx.GetRayApiServerClient().DeleteRayJob(tc.Input) + if tc.ExpectedError == nil { + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + waitForDeletedRayJob(t, tCtx, testJobRequest.Job.Name) + } else { + require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected") + require.NotNil(t, actualRpcStatus, "A not nill RPC status is required") + } + }) + } +} + +func TestGetAllJobs(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + testJobRequest := createTestJob(t, tCtx) + t.Cleanup(func() { + tCtx.DeleteRayJobByName(t, testJobRequest.Job.Name) + }) + + response, actualRpcStatus, err := tCtx.GetRayApiServerClient().ListAllRayJobs() + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, response, "A response is expected") + require.NotEmpty(t, response.Jobs, "A list of jobs is required") + require.Equal(t, testJobRequest.Job.Name, response.Jobs[0].Name) + require.Equal(t, tCtx.GetNamespaceName(), response.Jobs[0].Namespace) +} + +func TestGetJobsInNamespace(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + testJobRequest := createTestJob(t, tCtx) + t.Cleanup(func() { + tCtx.DeleteRayJobByName(t, testJobRequest.Job.Name) + }) + + response, actualRpcStatus, err := tCtx.GetRayApiServerClient().ListRayJobs(&api.ListRayJobsRequest{ + Namespace: tCtx.GetNamespaceName(), + }) + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, response, "A response is expected") + require.NotEmpty(t, response.Jobs, "A list of compute templates is required") + require.Equal(t, testJobRequest.Job.Name, response.Jobs[0].Name) + require.Equal(t, tCtx.GetNamespaceName(), response.Jobs[0].Namespace) +} + +func TestGetJob(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + testJobRequest := createTestJob(t, tCtx) + t.Cleanup(func() { + 
tCtx.DeleteRayJobByName(t, testJobRequest.Job.Name) + }) + tests := []GenericEnd2EndTest[*api.GetRayJobRequest]{ + { + Name: "Get job by name in a namespace", + Input: &api.GetRayJobRequest{ + Name: testJobRequest.Job.Name, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: nil, + }, + { + Name: "Get non existing job", + Input: &api.GetRayJobRequest{ + Name: "a-bogus-cluster-name", + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusNotFound, + }, + }, + { + Name: "Get a job with no Name", + Input: &api.GetRayJobRequest{ + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Get a job with no namespace", + Input: &api.GetRayJobRequest{ + Name: "some-Name", + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusNotFound, + }, + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.Name, func(t *testing.T) { + actualJob, actualRpcStatus, err := tCtx.GetRayApiServerClient().GetRayJob(tc.Input) + if tc.ExpectedError == nil { + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.Equal(t, tc.Input.Name, actualJob.Name) + require.Equal(t, tCtx.GetNamespaceName(), actualJob.Namespace) + } else { + require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected") + require.NotNil(t, actualRpcStatus, "A not nill RPC status is required") + } + }) + } +} + +func TestCreateJobWithClusterSelector(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + + cluster, configMapName := tCtx.CreateRayClusterWithConfigMaps(t, + map[string]string{ + "counter_sample.py": ReadFileAsString(t, "resources/counter_sample.py"), + "fail_fast.py": ReadFileAsString(t, "resources/fail_fast_sample.py"), + }) + t.Cleanup(func() { + tCtx.DeleteRayCluster(t, cluster.Name) + tCtx.DeleteConfigMap(t, configMapName) + }) + + tests := []struct { + Name string + Input *api.CreateRayJobRequest + ExpectedError error + ExpectedJobStatus rayv1api.JobStatus + }{ + { + Name: "Submit a correct job on an already running cluster", + Input: &api.CreateRayJobRequest{ + Job: &api.RayJob{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "r2d2", + Entrypoint: "python /home/ray/samples/counter_sample.py", + Metadata: map[string]string{}, + RuntimeEnv: "pip:\n - requests==2.26.0\n - pendulum==2.1.2\nenv_vars:\n counter_name: test_counter\n", + ClusterSelector: map[string]string{"ray.io/cluster": cluster.Name}, + TtlSecondsAfterFinished: 60, + JobSubmitter: &api.RayJobSubmitter{ + Image: cluster.ClusterSpec.HeadGroupSpec.Image, + }, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedJobStatus: rayv1api.JobStatusSucceeded, + ExpectedError: nil, + }, + { + Name: "Submit a failing job on an already running cluster", + Input: &api.CreateRayJobRequest{ + Job: &api.RayJob{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "r2d2", + Entrypoint: "python /home/ray/samples/fail_fast.py", + RuntimeEnv: "pip:\n - requests==2.26.0\n - pendulum==2.1.2\nenv_vars:\n counter_name: test_counter\n", + ShutdownAfterJobFinishes: true, + TtlSecondsAfterFinished: 60, + 
ClusterSelector: map[string]string{"ray.io/cluster": cluster.Name}, + JobSubmitter: &api.RayJobSubmitter{ + Image: cluster.ClusterSpec.HeadGroupSpec.Image, + }, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedJobStatus: rayv1api.JobStatusFailed, + ExpectedError: nil, + }, + } + // Execute tests sequentially + for _, tc := range tests { + t.Run(tc.Name, func(t *testing.T) { + actualJob, actualRpcStatus, err := tCtx.GetRayApiServerClient().CreateRayJob(tc.Input) + if tc.ExpectedError == nil { + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, actualJob, "A job is expected") + waitForRayJob(t, tCtx, tc.Input.Job.Name, tc.ExpectedJobStatus) + tCtx.DeleteRayJobByName(t, actualJob.Name) + } else { + require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected") + require.NotNil(t, actualRpcStatus, "A not nill RPC status is required") + } + }) + } +} + +func createTestJob(t *testing.T, tCtx *End2EndTestingContext) *api.CreateRayJobRequest { + // create config map and register a cleanup hook upon success + configMapName := tCtx.CreateConfigMap(t, map[string]string{ + "counter_sample.py": ReadFileAsString(t, "resources/counter_sample.py"), + "fail_fast.py": ReadFileAsString(t, "resources/fail_fast_sample.py"), + }) + t.Cleanup(func() { + tCtx.DeleteConfigMap(t, configMapName) + }) + + items := map[string]string{ + "counter_sample.py": "counter_sample.py", + "fail_fast.py": "fail_fast_sample.py", + } + + startParams := map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + } + volume := &api.Volume{ + MountPath: "/home/ray/samples", + VolumeType: api.Volume_CONFIGMAP, + Name: "code-sample", + Source: tCtx.GetConfigMapName(), + Items: items, + } + + testJobRequest := &api.CreateRayJobRequest{ + Job: &api.RayJob{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "natacha", + Entrypoint: "python /home/ray/samples/counter_sample.py", + RuntimeEnv: "pip:\n - requests==2.26.0\n - pendulum==2.1.2\nenv_vars:\n counter_name: test_counter\n", + ShutdownAfterJobFinishes: true, + ClusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + ServiceType: "NodePort", + EnableIngress: false, + RayStartParams: startParams, + Volumes: []*api.Volume{volume}, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "small-wg", + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 5, + RayStartParams: startParams, + Volumes: []*api.Volume{volume}, + }, + }, + }, + TtlSecondsAfterFinished: 60, + }, + Namespace: tCtx.GetNamespaceName(), + } + + actualJob, actualRpcStatus, err := tCtx.GetRayApiServerClient().CreateRayJob(testJobRequest) + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, actualJob, "A job is expected") + waitForRayJob(t, tCtx, testJobRequest.Job.Name, rayv1api.JobStatusSucceeded) + return testJobRequest +} + +func waitForRayJob(t *testing.T, tCtx *End2EndTestingContext, rayJobName string, rayJobStatus rayv1api.JobStatus) { + // wait for the job to be in a JobStatusSucceeded state for 3 minutes + // if is not in that state, return an error + err := wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) { + rayJob, err00 := tCtx.GetRayJobByName(rayJobName) + if err00 != nil { + return true, 
err00
+		}
+		t.Logf("Found ray job with state '%s' for ray job '%s'", rayJob.Status.JobStatus, rayJobName)
+		return rayJob.Status.JobStatus == rayJobStatus, nil
+	})
+	require.NoErrorf(t, err, "No error expected when getting status for ray job: '%s', err %v", rayJobName, err)
+}
+
+func waitForDeletedRayJob(t *testing.T, tCtx *End2EndTestingContext, jobName string) {
+	// wait up to 3 minutes for the job to be deleted
+	// if it is not deleted by then, return an error
+	err := wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) {
+		rayJob, err00 := tCtx.GetRayJobByName(jobName)
+		if err00 != nil {
+			// a "not found" error signals a successful deletion; anything else aborts the poll
+			if assert.EqualError(t, err00, "rayjobs.ray.io \""+jobName+"\" not found") {
+				return true, nil
+			}
+			return false, err00
+		}
+		t.Logf("Found status of '%s' for ray job '%s'", rayJob.Status.JobStatus, jobName)
+		return false, nil
+	})
+	require.NoErrorf(t, err, "No error expected when deleting ray job: '%s', err %v", jobName, err)
+}
diff --git a/apiserver/test/e2e/resources/counter_sample.py b/apiserver/test/e2e/resources/counter_sample.py
new file mode 100644
index 0000000000..bd4a85a7bd
--- /dev/null
+++ b/apiserver/test/e2e/resources/counter_sample.py
@@ -0,0 +1,29 @@
+
+import ray
+import os
+import requests
+
+ray.init()
+
+@ray.remote
+class Counter:
+    def __init__(self):
+        # Used to verify runtimeEnv
+        self.name = os.getenv("counter_name")
+        assert self.name == "test_counter"
+        self.counter = 0
+
+    def inc(self):
+        self.counter += 1
+
+    def get_counter(self):
+        return "{} got {}".format(self.name, self.counter)
+
+counter = Counter.remote()
+
+for _ in range(5):
+    ray.get(counter.inc.remote())
+    print(ray.get(counter.get_counter.remote()))
+
+# Verify that the correct runtime env was used for the job.
+assert requests.__version__ == "2.26.0"
diff --git a/apiserver/test/e2e/resources/fail_fast_sample.py b/apiserver/test/e2e/resources/fail_fast_sample.py
new file mode 100644
index 0000000000..5e5189a84f
--- /dev/null
+++ b/apiserver/test/e2e/resources/fail_fast_sample.py
@@ -0,0 +1,4 @@
+import sys
+
+print("Something is seriously wrong.", file=sys.stderr)
+sys.exit(1)
diff --git a/apiserver/test/e2e/service_server_e2e_test.go b/apiserver/test/e2e/service_server_e2e_test.go
new file mode 100644
index 0000000000..96281bf1bc
--- /dev/null
+++ b/apiserver/test/e2e/service_server_e2e_test.go
@@ -0,0 +1,910 @@
+package e2e
+
+import (
+	"net/http"
+	"testing"
+	"time"
+
+	kuberayHTTP "github.com/ray-project/kuberay/apiserver/pkg/http"
+	api "github.com/ray-project/kuberay/proto/go_client"
+	rayv1api "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"k8s.io/apimachinery/pkg/util/wait"
+)
+
+var serveConfigs = []*api.ServeConfig{
+	{
+		DeploymentName: "OrangeStand",
+		Replicas:       1,
+		UserConfig:     "price: 2",
+		ActorOptions: &api.ActorOptions{
+			CpusPerActor: 0.1,
+		},
+	},
+	{
+		DeploymentName: "PearStand",
+		Replicas:       1,
+		UserConfig:     "price: 1",
+		ActorOptions: &api.ActorOptions{
+			CpusPerActor: 0.1,
+		},
+	},
+	{
+		DeploymentName: "FruitMarket",
+		Replicas:       1,
+		ActorOptions: &api.ActorOptions{
+			CpusPerActor: 0.1,
+		},
+	},
+	{
+		DeploymentName: "DAGDriver",
+		Replicas:       1,
+		RoutePrefix:    "/",
+		ActorOptions: &api.ActorOptions{
+			CpusPerActor: 0.1,
+		},
+	},
+}
+
+// TestCreateServiceV2 sequentially exercises the Ray service endpoints using
+// V2 configurations (YAML)
+func TestCreateServiceV2(t *testing.T) {
+	tCtx, err := NewEnd2EndTestingContext(t)
+	require.NoError(t, err, "No error 
expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + clusterSpec := &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + ServiceType: "NodePort", + EnableIngress: false, + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "small-wg", + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 5, + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + }, + } + + tests := []GenericEnd2EndTest[*api.CreateRayServiceRequest]{ + { + Name: "Create a fruit stand ray service using V2 configuration", + Input: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "user1", + ServeConfig_V2: "applications:\n - name: fruit_app\n import_path: fruit.deployment_graph\n route_prefix: /fruit\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: MangoStand\n num_replicas: 1\n user_config:\n price: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: OrangeStand\n num_replicas: 1\n user_config:\n price: 2\n ray_actor_options:\n num_cpus: 0.1\n - name: PearStand\n num_replicas: 1\n user_config:\n price: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: FruitMarket\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: DAGDriver\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: math_app\n import_path: conditional_dag.serve_dag\n route_prefix: /calc\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: Adder\n num_replicas: 1\n user_config:\n increment: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: Multiplier\n num_replicas: 1\n user_config:\n factor: 5\n ray_actor_options:\n num_cpus: 0.1\n - name: Router\n num_replicas: 1\n - name: create_order\n num_replicas: 1\n - name: DAGDriver\n num_replicas: 1\n", + ServiceUnhealthySecondThreshold: 10, + DeploymentUnhealthySecondThreshold: 20, + ClusterSpec: clusterSpec, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: nil, + }, + { + Name: "Create a service request with no namespace value", + Input: &api.CreateRayServiceRequest{ + Service: &api.RayService{}, + Namespace: "", + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusNotFound, + }, + }, + { + Name: "Create a service request with mismatching namespaces", + Input: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Namespace: "another-namespace-name", + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create a service request with no name", + Input: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Namespace: tCtx.GetNamespaceName(), + Name: "", + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create a service request with no user", + Input: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + 
Namespace: tCtx.GetNamespaceName(), + Name: tCtx.GetNextName(), + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Create a service request with no cluster spec", + Input: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Namespace: tCtx.GetNamespaceName(), + Name: tCtx.GetNextName(), + ClusterSpec: nil, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.Name, func(t *testing.T) { + actualService, actualRpcStatus, err := tCtx.GetRayApiServerClient().CreateRayService(tc.Input) + if tc.ExpectedError == nil { + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, actualService, "A service is expected") + waitForRunningService(t, tCtx, actualService.Name) + tCtx.DeleteRayService(t, actualService.Name) + } else { + require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected") + require.NotNil(t, actualRpcStatus, "A not nill RPC status is required") + } + }) + } +} + +func TestCreateServiceV1(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + startParams := map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + } + clusterSpec := &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + ServiceType: "NodePort", + EnableIngress: false, + RayStartParams: startParams, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "small-wg", + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 5, + RayStartParams: startParams, + }, + }, + } + tests := []GenericEnd2EndTest[*api.CreateRayServiceRequest]{ + { + Name: "Create a fruit stand service V1", + Input: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "user1", + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + RuntimeEnv: "working_dir: \"https://github.com/ray-project/test_dag/archive/c620251044717ace0a4c19d766d43c5099af8a77.zip\"\n", + ServeConfigs: serveConfigs, + }, + ServiceUnhealthySecondThreshold: 300, + DeploymentUnhealthySecondThreshold: 900, + ClusterSpec: clusterSpec, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: nil, + }, + { + Name: "Create a V1 serve service with empty deployment graph spec", + Input: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "user1", + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + RuntimeEnv: "working_dir: \"https://github.com/ray-project/test_dag/archive/c620251044717ace0a4c19d766d43c5099af8a77.zip\"\n", + ServeConfigs: serveConfigs, + }, + ServiceUnhealthySecondThreshold: 30, + DeploymentUnhealthySecondThreshold: 90, + ClusterSpec: &api.ClusterSpec{}, + }, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: 
&kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.Name, func(t *testing.T) { + actualService, actualRpcStatus, err := tCtx.GetRayApiServerClient().CreateRayService(tc.Input) + if tc.ExpectedError == nil { + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, actualService, "A service is expected") + waitForRunningService(t, tCtx, actualService.Name) + tCtx.DeleteRayService(t, actualService.Name) + } else { + require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected") + require.NotNil(t, actualRpcStatus, "A not nill RPC status is required") + } + }) + } +} + +func TestDeleteService(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + testServiceRequest := createTestServiceV2(t, tCtx) + + tests := []GenericEnd2EndTest[*api.DeleteRayServiceRequest]{ + { + Name: "Delete an existing service", + Input: &api.DeleteRayServiceRequest{ + Name: testServiceRequest.Service.Name, + Namespace: testServiceRequest.Namespace, + }, + ExpectedError: nil, + }, + { + Name: "Delete a non existing service", + Input: &api.DeleteRayServiceRequest{ + Name: "a-bogus-job-name", + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusNotFound, + }, + }, + { + Name: "Delete a service without providing a namespace", + Input: &api.DeleteRayServiceRequest{ + Name: testServiceRequest.Service.Name, + Namespace: "", + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusNotFound, + }, + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.Name, func(t *testing.T) { + actualRpcStatus, err := tCtx.GetRayApiServerClient().DeleteRayService(tc.Input) + if tc.ExpectedError == nil { + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + waitForDeletedService(t, tCtx, testServiceRequest.Service.Name) + } else { + require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected") + require.NotNil(t, actualRpcStatus, "A not nill RPC status is required") + } + }) + } +} + +func TestGetAllServices(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + testServiceRequest := createTestServiceV2(t, tCtx) + t.Cleanup(func() { + tCtx.DeleteRayService(t, testServiceRequest.Service.Name) + }) + + response, actualRpcStatus, err := tCtx.GetRayApiServerClient().ListAllRayServices() + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, response, "A response is expected") + require.NotEmpty(t, response.Services, "A list of services is required") + require.Equal(t, testServiceRequest.Service.Name, response.Services[0].Name) + require.Equal(t, tCtx.GetNamespaceName(), response.Services[0].Namespace) +} + +func TestGetServicesInNamespace(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating 
testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + testServiceRequest := createTestServiceV2(t, tCtx) + t.Cleanup(func() { + tCtx.DeleteRayService(t, testServiceRequest.Service.Name) + }) + + response, actualRpcStatus, err := tCtx.GetRayApiServerClient().ListRayServices(&api.ListRayServicesRequest{ + Namespace: tCtx.GetNamespaceName(), + }) + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, response, "A response is expected") + require.NotEmpty(t, response.Services, "A list of compute templates is required") + require.Equal(t, testServiceRequest.Service.Name, response.Services[0].Name) + require.Equal(t, tCtx.GetNamespaceName(), response.Services[0].Namespace) +} + +func TestGetService(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + testServiceRequest := createTestServiceV2(t, tCtx) + t.Cleanup(func() { + tCtx.DeleteRayService(t, testServiceRequest.Service.Name) + }) + tests := []GenericEnd2EndTest[*api.GetRayServiceRequest]{ + { + Name: "Get job by name in a namespace", + Input: &api.GetRayServiceRequest{ + Name: testServiceRequest.Service.Name, + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: nil, + }, + { + Name: "Get non existing job", + Input: &api.GetRayServiceRequest{ + Name: "a-bogus-cluster-name", + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusNotFound, + }, + }, + { + Name: "Get a job with no Name", + Input: &api.GetRayServiceRequest{ + Namespace: tCtx.GetNamespaceName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Get a job with no namespace", + Input: &api.GetRayServiceRequest{ + Name: "some-Name", + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusNotFound, + }, + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.Name, func(t *testing.T) { + actualService, actualRpcStatus, err := tCtx.GetRayApiServerClient().GetRayService(tc.Input) + if tc.ExpectedError == nil { + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.Equal(t, tc.Input.Name, actualService.Name) + require.Equal(t, tCtx.GetNamespaceName(), actualService.Namespace) + } else { + require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected") + require.NotNil(t, actualRpcStatus, "A not nill RPC status is required") + } + }) + } +} + +func TestServiceServerV1Update(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + testServiceRequest := createTestServiceV1(t, tCtx) + t.Cleanup(func() { + tCtx.DeleteRayService(t, testServiceRequest.Service.Name) + }) + + tests := []GenericEnd2EndTest[*api.UpdateRayServiceRequest]{ + { + Name: "Update a fruit stand service V1 actor actions with no name", + Input: &api.UpdateRayServiceRequest{ + Service: &api.RayService{ + Name: "", + }, + Namespace: tCtx.GetNamespaceName(), + Name: tCtx.GetNextName(), + }, + ExpectedError: 
&kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Update a fruit stand service V1 actor actions with no namespace name", + Input: &api.UpdateRayServiceRequest{ + Service: &api.RayService{ + Name: tCtx.GetNextName(), + }, + Namespace: tCtx.GetNamespaceName(), + Name: tCtx.GetCurrentName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Update a fruit stand service V1 actor actions with no user", + Input: &api.UpdateRayServiceRequest{ + Service: &api.RayService{ + Namespace: tCtx.GetNamespaceName(), + Name: tCtx.GetNextName(), + User: "", + }, + Namespace: tCtx.GetNamespaceName(), + Name: tCtx.GetCurrentName(), + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Update a fruit stand service V1 actor actions with nil graph spec", + Input: &api.UpdateRayServiceRequest{ + Service: &api.RayService{ + Namespace: tCtx.GetNamespaceName(), + Name: testServiceRequest.Service.Name, + User: testServiceRequest.Service.User, + }, + Namespace: tCtx.GetNamespaceName(), + Name: testServiceRequest.Service.Name, + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Update a fruit stand service V1 actor actions with empty graph spec", + Input: &api.UpdateRayServiceRequest{ + Service: &api.RayService{ + Namespace: tCtx.GetNamespaceName(), + Name: testServiceRequest.Service.Name, + User: testServiceRequest.Service.User, + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{}, + }, + Namespace: tCtx.GetNamespaceName(), + Name: testServiceRequest.Service.Name, + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Update a fruit stand service V1 actor actions with no cluster spec", + Input: &api.UpdateRayServiceRequest{ + Service: &api.RayService{ + Namespace: testServiceRequest.Service.Namespace, + Name: testServiceRequest.Service.Name, + User: testServiceRequest.Service.User, + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + ServeConfigs: []*api.ServeConfig{}, + }, + }, + Namespace: testServiceRequest.Service.Namespace, + Name: testServiceRequest.Service.Name, + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + //TODO @z103cb this test is failing, needs to be investigated to determine if is a valid test, + // the cluster fails to come up. 
+ /* + { + Name: "Update a fruit stand service V1 actor actions with cluster and no serve configs", + Input: &api.UpdateRayServiceRequest{ + Service: &api.RayService{ + Namespace: testServiceRequest.Service.Namespace, + Name: testServiceRequest.Service.Name, + User: testServiceRequest.Service.User, + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + ServeConfigs: []*api.ServeConfig{}, + }, + ClusterSpec: testServiceRequest.Service.ClusterSpec, + }, + Namespace: testServiceRequest.Service.Namespace, + Name: testServiceRequest.Service.Name, + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + */ + { + Name: "Update a fruit stand service V1 actor actions", + Input: &api.UpdateRayServiceRequest{ + Service: &api.RayService{ + Namespace: testServiceRequest.Service.Namespace, + Name: testServiceRequest.Service.Name, + User: testServiceRequest.Service.User, + ServiceUnhealthySecondThreshold: 90, + DeploymentUnhealthySecondThreshold: 30, + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + RuntimeEnv: "working_dir: \"https://github.com/ray-project/test_dag/archive/c620251044717ace0a4c19d766d43c5099af8a77.zip\"\n", + ServeConfigs: []*api.ServeConfig{ + { + DeploymentName: "OrangeStand", + Replicas: 1, + UserConfig: "price: 2", + ActorOptions: &api.ActorOptions{ + CpusPerActor: 0.2, + }, + }, + { + DeploymentName: "PearStand", + Replicas: 1, + UserConfig: "price: 1", + ActorOptions: &api.ActorOptions{ + CpusPerActor: 0.2, + }, + }, + { + DeploymentName: "FruitMarket", + Replicas: 1, + ActorOptions: &api.ActorOptions{ + CpusPerActor: 0.2, + }, + }, + { + DeploymentName: "DAGDriver", + Replicas: 1, + RoutePrefix: "/", + ActorOptions: &api.ActorOptions{ + CpusPerActor: 0.2, + }, + }, + }, + }, + ClusterSpec: testServiceRequest.Service.ClusterSpec, + }, + Namespace: testServiceRequest.Service.Namespace, + Name: testServiceRequest.Service.Name, + }, + ExpectedError: nil, + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.Name, func(t *testing.T) { + actualService, actualRpcStatus, err := tCtx.GetRayApiServerClient().UpdateRayService(tc.Input) + if tc.ExpectedError == nil { + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, actualService, "A service is expected") + waitForRunningService(t, tCtx, actualService.Name) + } else { + require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected") + require.NotNil(t, actualRpcStatus, "A not nill RPC status is required") + } + }) + + } +} + +func TestServiceServerV1Patch(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + testServiceRequest := createTestServiceV1(t, tCtx) + t.Cleanup(func() { + tCtx.DeleteRayService(t, testServiceRequest.Service.Name) + }) + + tests := []GenericEnd2EndTest[*api.UpdateRayServiceConfigsRequest]{ + { + Name: "Update service cluster worker group", + Input: &api.UpdateRayServiceConfigsRequest{ + Name: testServiceRequest.Service.Name, + Namespace: testServiceRequest.Service.Namespace, + UpdateService: &api.UpdateRayServiceBody{ + WorkerGroupUpdateSpec: []*api.WorkerGroupUpdateSpec{ + { + GroupName: "small-wg", + Replicas: 2, + MinReplicas: 2, + 
MaxReplicas: 10, + }, + }, + }, + }, + ExpectedError: nil, + }, + { + Name: "Update service cluster worker group with no values", + Input: &api.UpdateRayServiceConfigsRequest{ + Name: testServiceRequest.Service.Name, + Namespace: testServiceRequest.Service.Namespace, + UpdateService: &api.UpdateRayServiceBody{ + WorkerGroupUpdateSpec: []*api.WorkerGroupUpdateSpec{}, + }, + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Update service cluster worker group with no name", + Input: &api.UpdateRayServiceConfigsRequest{ + Name: testServiceRequest.Service.Name, + Namespace: testServiceRequest.Service.Namespace, + UpdateService: &api.UpdateRayServiceBody{ + WorkerGroupUpdateSpec: []*api.WorkerGroupUpdateSpec{ + {}, + }, + }, + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + { + Name: "Update service cluster worker group with no name and valid replicas", + Input: &api.UpdateRayServiceConfigsRequest{ + Name: testServiceRequest.Service.Name, + Namespace: testServiceRequest.Service.Namespace, + UpdateService: &api.UpdateRayServiceBody{ + WorkerGroupUpdateSpec: []*api.WorkerGroupUpdateSpec{ + { + GroupName: "", + Replicas: 4, + MinReplicas: 4, + MaxReplicas: 12, + }, + }, + }, + }, + ExpectedError: &kuberayHTTP.KuberayAPIServerClientError{ + HTTPStatusCode: http.StatusBadRequest, + }, + }, + } + + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.Name, func(t *testing.T) { + actualService, actualRpcStatus, err := tCtx.GetRayApiServerClient().UpdateRayServiceConfigs(tc.Input) + if tc.ExpectedError == nil { + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, actualService, "A service is expected") + waitForRunningServiceWithWorkGroupSpec(t, tCtx, actualService.Name, tc.Input.UpdateService.WorkerGroupUpdateSpec[0].MinReplicas, + tc.Input.UpdateService.WorkerGroupUpdateSpec[0].MaxReplicas, tc.Input.UpdateService.WorkerGroupUpdateSpec[0].Replicas) + } else { + require.EqualError(t, err, tc.ExpectedError.Error(), "Matching error expected") + require.NotNil(t, actualRpcStatus, "A not nill RPC status is required") + } + }) + } +} + +func createTestServiceV2(t *testing.T, tCtx *End2EndTestingContext) *api.CreateRayServiceRequest { + clusterSpec := &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + ServiceType: "NodePort", + EnableIngress: false, + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "small-wg", + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 5, + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + }, + } + + testServiceRequest := &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "user1", + ServeConfig_V2: "applications:\n - name: fruit_app\n import_path: fruit.deployment_graph\n route_prefix: /fruit\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: MangoStand\n num_replicas: 1\n user_config:\n price: 3\n 
ray_actor_options:\n num_cpus: 0.1\n - name: OrangeStand\n num_replicas: 1\n user_config:\n price: 2\n ray_actor_options:\n num_cpus: 0.1\n - name: PearStand\n num_replicas: 1\n user_config:\n price: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: FruitMarket\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: DAGDriver\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: math_app\n import_path: conditional_dag.serve_dag\n route_prefix: /calc\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: Adder\n num_replicas: 1\n user_config:\n increment: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: Multiplier\n num_replicas: 1\n user_config:\n factor: 5\n ray_actor_options:\n num_cpus: 0.1\n - name: Router\n num_replicas: 1\n - name: create_order\n num_replicas: 1\n - name: DAGDriver\n num_replicas: 1\n", + ServiceUnhealthySecondThreshold: 10, + DeploymentUnhealthySecondThreshold: 20, + ClusterSpec: clusterSpec, + }, + Namespace: tCtx.GetNamespaceName(), + } + actualService, actualRpcStatus, err := tCtx.GetRayApiServerClient().CreateRayService(testServiceRequest) + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, actualService, "A service is expected") + waitForRunningService(t, tCtx, actualService.Name) + + return testServiceRequest +} + +func createTestServiceV1(t *testing.T, tCtx *End2EndTestingContext) *api.CreateRayServiceRequest { + clusterSpec := &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + ServiceType: "NodePort", + EnableIngress: false, + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "small-wg", + ComputeTemplate: tCtx.GetComputeTemplateName(), + Image: tCtx.GetRayImage(), + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 5, + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + }, + } + + testServiceRequest := &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Name: tCtx.GetNextName(), + Namespace: tCtx.GetNamespaceName(), + User: "user1", + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + RuntimeEnv: "working_dir: \"https://github.com/ray-project/test_dag/archive/c620251044717ace0a4c19d766d43c5099af8a77.zip\"\n", + ServeConfigs: serveConfigs, + }, + ServiceUnhealthySecondThreshold: 90, + DeploymentUnhealthySecondThreshold: 30, + ClusterSpec: clusterSpec, + }, + Namespace: tCtx.GetNamespaceName(), + } + actualService, actualRpcStatus, err := tCtx.GetRayApiServerClient().CreateRayService(testServiceRequest) + require.NoError(t, err, "No error expected") + require.Nil(t, actualRpcStatus, "No RPC status expected") + require.NotNil(t, actualService, "A service is expected") + waitForRunningService(t, tCtx, actualService.Name) + + return testServiceRequest +} + +func waitForRunningService(t *testing.T, tCtx *End2EndTestingContext, serviceName string) { + // wait for the service to be in a running state for 3 minutes + // if is not in that state, return an error + err := wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) { + rayService, err00 := tCtx.GetRayServiceByName(serviceName) + if err00 != nil { + return true, err00 + } + 
t.Logf("Found status of '%s' for ray service '%s'", rayService.Status.ServiceStatus, serviceName)
+		return rayService.Status.ServiceStatus == rayv1api.Running, nil
+	})
+	require.NoErrorf(t, err, "No error expected when getting ray service: '%s', err %v", serviceName, err)
+}
+
+func waitForDeletedService(t *testing.T, tCtx *End2EndTestingContext, serviceName string) {
+	// wait up to 3 minutes for the service to be deleted
+	// if it is not deleted by then, return an error
+	err := wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) {
+		rayService, err00 := tCtx.GetRayServiceByName(serviceName)
+		if err00 != nil {
+			// a "not found" error signals a successful deletion; anything else aborts the poll
+			if assert.EqualError(t, err00, "rayservices.ray.io \""+serviceName+"\" not found") {
+				return true, nil
+			}
+			return false, err00
+		}
+		t.Logf("Found status of '%s' for ray service '%s'", rayService.Status.ServiceStatus, serviceName)
+		return false, nil
+	})
+	require.NoErrorf(t, err, "No error expected when deleting ray service: '%s', err %v", serviceName, err)
+}
+
+func waitForRunningServiceWithWorkGroupSpec(t *testing.T, tCtx *End2EndTestingContext, serviceName string, minWorkerReplicas, maxWorkerReplicas, availableWorkerReplicas int32) {
+	// wait up to 3 minutes for the service to be in a running state
+	// if it is not in that state by then, return an error
+	err := wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) {
+		rayService, err00 := tCtx.GetRayServiceByName(serviceName)
+		if err00 != nil {
+			return true, err00
+		}
+		t.Logf("Found status of '%s' for ray service '%s'", rayService.Status.ServiceStatus, serviceName)
+		if rayService.Status.ServiceStatus == rayv1api.Running {
+			rayCluster, err := tCtx.GetRayClusterByName(rayService.Status.ActiveServiceStatus.RayClusterName)
+			require.NoErrorf(t, err, "Expecting no error when getting cluster named: '%s'",
+				rayService.Status.ActiveServiceStatus.RayClusterName)
+			t.Logf("Ray service cluster state is: MinWorkerReplicas = %d && MaxWorkerReplicas == %d && AvailableWorkerReplicas == %d",
+				rayCluster.Status.MinWorkerReplicas, rayCluster.Status.MaxWorkerReplicas, rayCluster.Status.AvailableWorkerReplicas)
+			if rayCluster.Status.MinWorkerReplicas == minWorkerReplicas &&
+				rayCluster.Status.MaxWorkerReplicas == maxWorkerReplicas &&
+				rayCluster.Status.AvailableWorkerReplicas == availableWorkerReplicas {
+				return true, nil
+			}
+		}
+		return false, nil
+	})
+	require.NoErrorf(t, err, "No error expected when getting ray service: '%s', err %v", serviceName, err)
+}
diff --git a/apiserver/test/e2e/types.go b/apiserver/test/e2e/types.go
new file mode 100644
index 0000000000..d68a7ea6d0
--- /dev/null
+++ b/apiserver/test/e2e/types.go
@@ -0,0 +1,415 @@
+package e2e
+
+import (
+	"context"
+	"net/http"
+	"os"
+	"runtime"
+	"strings"
+	"testing"
+	"time"
+
+	petnames "github.com/dustinkirkland/golang-petname"
+	kuberayHTTP "github.com/ray-project/kuberay/apiserver/pkg/http"
+	api "github.com/ray-project/kuberay/proto/go_client"
+	rayv1api "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
+	rayv1 "github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned/typed/ray/v1"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"google.golang.org/protobuf/proto"
+	batchv1 "k8s.io/api/batch/v1"
+	v1 "k8s.io/api/core/v1"
+	k8sApiErrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/client-go/kubernetes"
+	"sigs.k8s.io/controller-runtime/pkg/client/config"
+)
+
+// GenericEnd2EndTest struct allows for reuse in setting up and running tests
+type GenericEnd2EndTest[I proto.Message] struct {
+	Name          string
+	Input         I
+	ExpectedError error
+}
+
+// End2EndTestingContext provides a common set of values and methods that
+// can be used in executing the tests
+type End2EndTestingContext struct {
+	ctx                    context.Context
+	apiServerHttpClient    *http.Client
+	kuberayAPIServerClient *kuberayHTTP.KuberayAPIServerClient
+	rayClient              rayv1.RayV1Interface
+	k8client               *kubernetes.Clientset
+	apiServerBaseURL       string
+	rayImage               string
+	namespaceName          string
+	computeTemplateName    string
+	clusterName            string
+	configMapName          string
+	currentName            string
+}
+
+// contextOption is a functional option that allows for building out an instance
+// of *End2EndTestingContext
+type contextOption func(t *testing.T, tCtx *End2EndTestingContext) error
+
+// NewEnd2EndTestingContext constructs a *End2EndTestingContext
+func NewEnd2EndTestingContext(t *testing.T) (*End2EndTestingContext, error) {
+	petnames.NonDeterministicMode()
+	// ordering is important as there are dependencies between field values
+	return newEnd2EndTestingContext(t,
+		withRayImage(),
+		withBaseURL(),
+		withHttpClient(),
+		withContext(),
+		withK8sClient(),
+		withRayClient(),
+		withNamespace(),
+	)
+}
+
+func newEnd2EndTestingContext(t *testing.T, options ...contextOption) (*End2EndTestingContext, error) {
+	testingContext := &End2EndTestingContext{
+		namespaceName:       petnames.Generate(2, "-"),
+		computeTemplateName: petnames.Name(),
+		clusterName:         petnames.Name(),
+		configMapName:       petnames.Generate(2, "-"),
+		currentName:         petnames.Name(),
+	}
+	for _, o := range options {
+		err := o(t, testingContext)
+		if err != nil {
+			return nil, err
+		}
+	}
+	return testingContext, nil
+}
+
+func withHttpClient() contextOption {
+	return func(_ *testing.T, testingContext *End2EndTestingContext) error {
+		testingContext.apiServerHttpClient = &http.Client{Timeout: time.Duration(1) * time.Second}
+		testingContext.kuberayAPIServerClient = kuberayHTTP.NewKuberayAPIServerClient(testingContext.apiServerBaseURL, testingContext.apiServerHttpClient)
+		return nil
+	}
+}
+
+func withContext() contextOption {
+	return func(_ *testing.T, testingContext *End2EndTestingContext) error {
+		testingContext.ctx = context.Background()
+		return nil
+	}
+}
+
+func withBaseURL() contextOption {
+	return func(_ *testing.T, testingContext *End2EndTestingContext) error {
+		baseURL := os.Getenv("E2E_API_SERVER_URL")
+		if strings.TrimSpace(baseURL) == "" {
+			baseURL = "http://localhost:31888"
+		}
+		testingContext.apiServerBaseURL = baseURL
+		return nil
+	}
+}
+
+func withRayImage() contextOption {
+	return func(_ *testing.T, testingContext *End2EndTestingContext) error {
+		rayImage := os.Getenv("E2E_API_SERVER_RAY_IMAGE")
+		if strings.TrimSpace(rayImage) == "" {
+			rayImage = "rayproject/ray:2.7.0-py310"
+		}
+		// detect if we are running on an arm64 machine, most likely Apple silicon
+		// the OS name is not checked as it is also possible that it might be linux
+		// also check that the image does not already have the `-aarch64` suffix
+		if runtime.GOARCH == "arm64" && !strings.HasSuffix(rayImage, "-aarch64") {
+			rayImage = rayImage + "-aarch64"
+		}
+		testingContext.rayImage = rayImage
+		return nil
+	}
+}
+
+func withK8sClient() contextOption {
+	return func(t *testing.T, testingContext *End2EndTestingContext) error {
+		cfg, err := config.GetConfig()
+		require.NoError(t, err, "No error expected when getting k8s client configuration")
+		clientSet, err := kubernetes.NewForConfig(cfg)
+		require.NoError(t, err, "No error expected when creating k8s client")
+		testingContext.k8client = clientSet
+		return nil
+	}
+}
+
+func withNamespace() contextOption {
+	return func(t *testing.T, tCtx *End2EndTestingContext) error {
+		require.NotNil(t, tCtx.k8client, "A k8s client must be created prior to creating a namespace")
+		require.NotNil(t, tCtx.ctx, "A context must exist prior to creating a namespace")
+		require.NotEmpty(t, tCtx.namespaceName, "Namespace name must be set prior to creating a namespace")
+		nsName := &v1.Namespace{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: tCtx.namespaceName,
+			},
+		}
+
+		_, err := tCtx.k8client.CoreV1().Namespaces().Create(tCtx.ctx, nsName, metav1.CreateOptions{})
+		require.NoErrorf(t, err, "Expected to create a namespace '%s'", nsName.ObjectMeta.Name)
+
+		// register an automatic deletion of the namespace at test's end
+		t.Cleanup(func() {
+			err := tCtx.k8client.CoreV1().Namespaces().Delete(tCtx.ctx, tCtx.namespaceName, metav1.DeleteOptions{})
+			assert.NoErrorf(t, err, "No error expected when deleting namespace '%s'", tCtx.namespaceName)
+		})
+		return nil
+	}
+}
+
+func withRayClient() contextOption {
+	return func(t *testing.T, tCtx *End2EndTestingContext) error {
+		cfg, err := config.GetConfig()
+		require.NoError(t, err, "No error expected when getting k8s client configuration")
+		tCtx.rayClient, err = rayv1.NewForConfig(cfg)
+		require.NoError(t, err, "No error expected when creating a Ray client")
+		return nil
+	}
+}
+
+func (e2etc *End2EndTestingContext) GetRayServiceByName(serviceName string) (*rayv1api.RayService, error) {
+	return e2etc.rayClient.RayServices(e2etc.namespaceName).Get(e2etc.ctx, serviceName, metav1.GetOptions{})
+}
+
+func (e2etc *End2EndTestingContext) GetRayClusterByName(clusterName string) (*rayv1api.RayCluster, error) {
+	return e2etc.rayClient.RayClusters(e2etc.namespaceName).Get(e2etc.ctx, clusterName, metav1.GetOptions{})
+}
+
+func (e2etc *End2EndTestingContext) GetBatchV1JobByName(jobName string) (*batchv1.Job, error) {
+	return e2etc.k8client.BatchV1().Jobs(e2etc.namespaceName).Get(e2etc.ctx, jobName, metav1.GetOptions{})
+}
+
+func (e2etc *End2EndTestingContext) GetRayClusterName() string {
+	return e2etc.clusterName
+}
+
+func (e2etc *End2EndTestingContext) GetRayJobByName(rayJobName string) (*rayv1api.RayJob, error) {
+	return e2etc.rayClient.RayJobs(e2etc.namespaceName).Get(e2etc.ctx, rayJobName, metav1.GetOptions{})
+}
+
+func (e2etc *End2EndTestingContext) GetConfigMapName() string {
+	return e2etc.configMapName
+}
+
+func (e2etc *End2EndTestingContext) GetNamespaceName() string {
+	return e2etc.namespaceName
+}
+
+func (e2etc *End2EndTestingContext) GetComputeTemplateName() string {
+	return e2etc.computeTemplateName
+}
+
+func (e2etc *End2EndTestingContext) GetRayImage() string {
+	return e2etc.rayImage
+}
+
+func (e2etc *End2EndTestingContext) GetRayApiServerClient() *kuberayHTTP.KuberayAPIServerClient {
+	return e2etc.kuberayAPIServerClient
+}
+
+func (e2etc *End2EndTestingContext) GetNextName() string {
+	e2etc.currentName = petnames.Name()
+	return e2etc.currentName
+}
+
+func (e2etc *End2EndTestingContext) GetCurrentName() string {
+	return e2etc.currentName
+}
+
+func (e2etc *End2EndTestingContext) CreateComputeTemplate(t *testing.T) {
+	computeTemplateRequest := &api.CreateComputeTemplateRequest{
+		ComputeTemplate: &api.ComputeTemplate{
+			Name:      e2etc.computeTemplateName,
+			Namespace: e2etc.namespaceName,
+			Cpu:       2,
+			Memory:    4,
+		},
+		Namespace: e2etc.namespaceName,
+	}
+
+	_, status, err := e2etc.kuberayAPIServerClient.CreateComputeTemplate(computeTemplateRequest)
+	if !assert.NoErrorf(t, err, "No error expected while creating a compute template (%s, %s)", e2etc.namespaceName, e2etc.computeTemplateName) {
+		t.Fatalf("Received status of %v when attempting to create compute template", status)
+	}
+}
+
+func (e2etc *End2EndTestingContext) DeleteComputeTemplate(t *testing.T) {
+	deleteComputeTemplateRequest := &api.DeleteComputeTemplateRequest{
+		Name:      e2etc.computeTemplateName,
+		Namespace: e2etc.namespaceName,
+	}
+	status, err := e2etc.kuberayAPIServerClient.DeleteComputeTemplate(deleteComputeTemplateRequest)
+	if !assert.NoErrorf(t, err, "No error expected while deleting a compute template (%s, %s)", e2etc.computeTemplateName, e2etc.namespaceName) {
+		t.Fatalf("Received status of %v when attempting to delete compute template", status)
+	}
+}
+
+func (e2etc *End2EndTestingContext) CreateRayClusterWithConfigMaps(t *testing.T, configMapValues map[string]string) (*api.Cluster, string) {
+	configMapName := e2etc.CreateConfigMap(t, configMapValues)
+	t.Cleanup(func() {
+		e2etc.DeleteConfigMap(t, configMapName)
+	})
+	items := make(map[string]string)
+	for k := range configMapValues {
+		items[k] = k
+	}
+	actualCluster, status, err := e2etc.kuberayAPIServerClient.CreateCluster(&api.CreateClusterRequest{
+		Cluster: &api.Cluster{
+			Name:        e2etc.clusterName,
+			Namespace:   e2etc.namespaceName,
+			User:        "3cpo",
+			Environment: api.Cluster_DEV,
+			ClusterSpec: &api.ClusterSpec{
+				HeadGroupSpec: &api.HeadGroupSpec{
+					ComputeTemplate: e2etc.computeTemplateName,
+					Image:           e2etc.rayImage,
+					ServiceType:     "NodePort",
+					EnableIngress:   false,
+					RayStartParams: map[string]string{
+						"dashboard-host":      "0.0.0.0",
+						"metrics-export-port": "8080",
+					},
+					Volumes: []*api.Volume{
+						{
+							MountPath:  "/home/ray/samples",
+							VolumeType: api.Volume_CONFIGMAP,
+							Name:       "code-sample",
+							Source:     e2etc.configMapName,
+							Items:      items,
+						},
+					},
+				},
+				WorkerGroupSpec: []*api.WorkerGroupSpec{
+					{
+						GroupName:       "small-wg",
+						ComputeTemplate: e2etc.computeTemplateName,
+						Image:           e2etc.rayImage,
+						Replicas:        1,
+						MinReplicas:     1,
+						MaxReplicas:     5,
+						RayStartParams: map[string]string{
+							"dashboard-host":      "0.0.0.0",
+							"metrics-export-port": "8080",
+						},
+						Volumes: []*api.Volume{
+							{
+								MountPath:  "/home/ray/samples",
+								VolumeType: api.Volume_CONFIGMAP,
+								Name:       "code-sample",
+								Source:     e2etc.configMapName,
+								Items:      items,
+							},
+						},
+					},
+				},
+			},
+		},
+		Namespace: e2etc.namespaceName,
+	})
+	if !assert.NoErrorf(t, err, "No error expected while creating cluster (%s/%s)", e2etc.namespaceName, e2etc.clusterName) {
+		t.Fatalf("Received status of %v when attempting to create a cluster", status)
+	}
+	// wait up to 3 minutes for the cluster to be in a ready state
+	// if it is not in that state by then, return an error
+	err = wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) {
+		rayCluster, err00 := e2etc.GetRayClusterByName(actualCluster.Name)
+		if err00 != nil {
+			return true, err00
+		}
+		t.Logf("Found cluster state of '%s' for ray cluster '%s'", rayCluster.Status.State, e2etc.GetRayClusterName())
+		return rayCluster.Status.State == rayv1api.Ready, nil
+	})
+	require.NoErrorf(t, err, "No error expected when getting ray cluster: '%s', err %v", e2etc.GetRayClusterName(), err)
+	return actualCluster, configMapName
+}
+
+func (e2etc *End2EndTestingContext) DeleteRayCluster(t *testing.T, clusterName string) {
+	_, err := e2etc.kuberayAPIServerClient.DeleteCluster(&api.DeleteClusterRequest{
+		Name:      clusterName,
+		Namespace: e2etc.namespaceName,
+	})
+	require.NoErrorf(t, err, "No error expected when deleting ray cluster: '%s', err %v", clusterName, err)
+
+	// wait up to 3 minutes for the cluster to be deleted
+	// if it is not deleted by then, return an error
+	err = wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) {
+		rayCluster, err00 := e2etc.GetRayClusterByName(clusterName)
+		if err00 != nil {
+			// a "not found" error signals a successful deletion; anything else aborts the poll
+			if k8sApiErrors.IsNotFound(err00) {
+				return true, nil
+			}
+			return false, err00
+		}
+		t.Logf("Found cluster state of '%s' for ray cluster '%s'", rayCluster.Status.State, clusterName)
+		return false, nil
+	})
+	require.NoErrorf(t, err, "No error expected when waiting for ray cluster: '%s' to be deleted, err %v", clusterName, err)
+}
+
+func (e2etc *End2EndTestingContext) DeleteRayService(t *testing.T, serviceName string) {
+	_, err := e2etc.kuberayAPIServerClient.DeleteRayService(&api.DeleteRayServiceRequest{
+		Name:      serviceName,
+		Namespace: e2etc.namespaceName,
+	})
+	require.NoErrorf(t, err, "No error expected when deleting ray service: '%s', err %v", serviceName, err)
+
+	// wait up to 3 minutes for the service to be deleted
+	// if it is not deleted by then, return an error
+	err = wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) {
+		rayService, err00 := e2etc.GetRayServiceByName(serviceName)
+		if err00 != nil {
+			if k8sApiErrors.IsNotFound(err00) {
+				return true, nil
+			}
+			return false, err00
+		}
+		t.Logf("Found service state of '%s' for ray service '%s'", rayService.Status.ServiceStatus, serviceName)
+		return false, nil
+	})
+	require.NoErrorf(t, err, "No error expected when waiting to delete ray service: '%s', err %v", serviceName, err)
+}
+
+func (e2etc *End2EndTestingContext) DeleteRayJobByName(t *testing.T, rayJobName string) {
+	_, err := e2etc.kuberayAPIServerClient.DeleteRayJob(&api.DeleteRayJobRequest{
+		Name:      rayJobName,
+		Namespace: e2etc.namespaceName,
+	})
+	require.NoErrorf(t, err, "No error expected when deleting ray job: '%s', err %v", rayJobName, err)
+
+	// wait up to 3 minutes for the job to be deleted
+	// if it is not deleted by then, return an error
+	err = wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) {
+		rayJob, err00 := e2etc.GetRayJobByName(rayJobName)
+		if err00 != nil {
+			if k8sApiErrors.IsNotFound(err00) {
+				return true, nil
+			}
+			return false, err00
+		}
+		t.Logf("Found job state of '%s' for ray job '%s'", rayJob.Status.JobStatus, rayJobName)
+		return false, nil
+	})
+	require.NoErrorf(t, err, "No error expected when waiting to delete ray job: '%s', err %v", rayJobName, err)
+}
+
+func (e2etc *End2EndTestingContext) CreateConfigMap(t *testing.T, values map[string]string) string {
+	cm := &v1.ConfigMap{
+		TypeMeta:   metav1.TypeMeta{Kind: "ConfigMap", APIVersion: "v1"},
+		ObjectMeta: metav1.ObjectMeta{Name: e2etc.configMapName, Namespace: e2etc.namespaceName},
+		Immutable:  new(bool),
+		Data:       values,
+	}
+	_, err := e2etc.k8client.CoreV1().ConfigMaps(e2etc.namespaceName).Create(e2etc.ctx, cm, metav1.CreateOptions{})
+	require.NoErrorf(t, err, "No error expected when creating config map '%s' in namespace '%s'", e2etc.configMapName, e2etc.namespaceName)
+	return e2etc.configMapName
+}
+
+func (e2etc *End2EndTestingContext) DeleteConfigMap(t *testing.T, configMapName string) {
+	err := e2etc.k8client.CoreV1().ConfigMaps(e2etc.namespaceName).Delete(e2etc.ctx, configMapName, metav1.DeleteOptions{})
+	if err != nil {
+		assert.Truef(t, k8sApiErrors.IsNotFound(err), "Only a NotFound error is allowed here, received %v", err)
+	}
+}
diff --git a/apiserver/test/e2e/utils.go b/apiserver/test/e2e/utils.go
new file mode 100644
index 0000000000..d056ec7088
--- /dev/null
+++ b/apiserver/test/e2e/utils.go
@@ -0,0 +1,57 @@
+package e2e
+
+import (
+	"bytes"
+	"embed"
+	"encoding/json"
+	"io"
+	"net/http"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+//go:embed resources/*.py
+var files embed.FS
+
+// CreateHttpRequest instantiates an HTTP request for the specified endpoint and host
+func CreateHttpRequest(method string, host string, endPoint string, body io.Reader) (*http.Request, error) {
+	url := host + endPoint
+	req, err := http.NewRequest(method, url, body)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Add("Accept", "application/json")
+	req.Header.Add("Content-Type", "application/json")
+	return req, nil
+}
+
+// MakeBodyReader creates an io.Reader from the supplied string if it is not empty after
+// trimming the spaces
+func MakeBodyReader(s string) io.Reader {
+	if strings.TrimSpace(s) != "" {
+		return strings.NewReader(s)
+	}
+	return nil
+}
+
+// PrettyPrintResponseBody generates a "pretty" formatted JSON string from the body
+func PrettyPrintResponseBody(body io.ReadCloser) (string, error) {
+	inputBytez, err := io.ReadAll(body)
+	if err != nil {
+		return "", err
+	}
+	var prettyJSON bytes.Buffer
+	// return the indentation error itself rather than the earlier (nil) read error
+	if err := json.Indent(&prettyJSON, inputBytez, "", "\t"); err != nil {
+		return "", err
+	}
+	return prettyJSON.String(), nil
+}
+
+func ReadFileAsString(t *testing.T, fileName string) string {
+	file, err := files.ReadFile(fileName)
+	require.NoErrorf(t, err, "No error expected when reading embedded file: '%s'", fileName)
+	return string(file)
+}
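The tests added in this change all share one table-driven shape: a slice of `GenericEnd2EndTest` cases, each pairing an input request with either a nil error (success) or an exact expected error. A minimal, dependency-free sketch of that pattern follows; the names (`genericCase`, `deleteReq`, `deleteJob`) are illustrative only and are not part of this PR.

```go
package main

import (
	"errors"
	"fmt"
)

// genericCase mirrors GenericEnd2EndTest: the type parameter I is the request
// type, so one case shape serves every API server endpoint. (The real struct
// constrains I to proto.Message; `any` keeps this sketch self-contained.)
type genericCase[I any] struct {
	name        string
	input       I
	expectedErr error
}

type deleteReq struct{ name, namespace string }

func main() {
	// a fake "client" call standing in for the API server client: only the
	// known name in a non-empty namespace succeeds
	deleteJob := func(r deleteReq) error {
		if r.name != "known-job" || r.namespace == "" {
			return errors.New("not found")
		}
		return nil
	}

	cases := []genericCase[deleteReq]{
		{name: "delete existing job", input: deleteReq{"known-job", "ns"}, expectedErr: nil},
		{name: "delete missing job", input: deleteReq{"bogus", "ns"}, expectedErr: errors.New("not found")},
	}
	for _, tc := range cases {
		err := deleteJob(tc.input)
		// success cases expect no error; failure cases expect an exact match,
		// just like the require.EqualError branches in the tests above
		ok := (tc.expectedErr == nil && err == nil) ||
			(tc.expectedErr != nil && err != nil && err.Error() == tc.expectedErr.Error())
		fmt.Printf("%s: ok=%v\n", tc.name, ok)
	}
}
```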
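`NewEnd2EndTestingContext` assembles its fixture through functional options applied in order, which is why options that depend on earlier fields (the clients before the namespace that uses them) must be passed in sequence. A minimal sketch of the pattern, with illustrative names rather than the PR's actual types:

```go
package e2e

// option plays the role of contextOption in this sketch; fixture, newFixture,
// and withFixtureURL are hypothetical names, not part of the PR.
type option func(*fixture) error

type fixture struct {
	baseURL  string
	rayImage string
}

func withFixtureURL(url string) option {
	return func(f *fixture) error {
		f.baseURL = url
		return nil
	}
}

func newFixture(opts ...option) (*fixture, error) {
	f := &fixture{}
	// options run in the order supplied; the first failing option aborts
	// construction, mirroring newEnd2EndTestingContext above
	for _, o := range opts {
		if err := o(f); err != nil {
			return nil, err
		}
	}
	return f, nil
}
```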
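The delete helpers (`DeleteRayCluster`, `DeleteRayService`, `DeleteRayJobByName`, and the `waitForDeleted*` functions) all follow the same polling recipe: call `wait.Poll` every 500ms for up to 3 minutes, treat a "not found" error as successful deletion, and abort on any other error. A generic sketch distilling that recipe; `waitForNotFound` is a hypothetical helper, while `wait.Poll` and `k8sApiErrors.IsNotFound` are the same APIs used above.

```go
package e2e

import (
	"time"

	k8sApiErrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/util/wait"
)

// waitForNotFound keeps polling while the object still exists, stops
// successfully once the getter reports "not found", and aborts the poll
// on any other error.
func waitForNotFound(get func() error) error {
	return wait.Poll(500*time.Millisecond, 3*time.Minute, func() (bool, error) {
		err := get()
		if err == nil {
			return false, nil // object still exists; keep polling
		}
		if k8sApiErrors.IsNotFound(err) {
			return true, nil // deletion confirmed
		}
		return false, err // unexpected error; abort the poll
	})
}
```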