-
Notifications
You must be signed in to change notification settings - Fork 432
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[RayJob]: Add RayJob with cluster selector sample
- Loading branch information
1 parent
62519ac
commit 2a56347
Showing
1 changed file
with
149 additions
and
0 deletions.
There are no files selected for viewing
149 changes: 149 additions & 0 deletions
149
ray-operator/config/samples/ray-job.cluster-selector.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
# This YAML file is primarily for testing purposes. It creates a "long-running" RayCluster
# and then a RayJob whose cluster selector matches the name of that previously created
# RayCluster, so the job is submitted into it.
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: long-running-cluster
spec:
  rayVersion: '2.7.0'  # Should match the Ray version in the container images.
  # Ray head Pod template.
  headGroupSpec:
    headService:
      metadata:
        name: custom-ray-head-service-name
    # `rayStartParams` are used to configure the `ray start` command.
    # Default settings of `rayStartParams` in KubeRay:
    #   https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md
    # All available options in `rayStartParams`:
    #   https://docs.ray.io/en/latest/cluster/cli.html#ray-start
    rayStartParams:
      dashboard-host: '0.0.0.0'
    # Pod template
    template:
      spec:
        containers:
          - name: ray-head
            image: rayproject/ray:2.7.0
            ports:
              - containerPort: 6379
                name: gcs-server
              - containerPort: 8265  # Ray dashboard
                name: dashboard
              - containerPort: 10001
                name: client
            resources:
              limits:
                cpu: "1"
              requests:
                cpu: "1"
            # Mount the sample scripts (from the `code-sample` volume) at the
            # path used by the RayJob entrypoint.
            volumeMounts:
              - mountPath: /home/ray/samples
                name: code-sample
        volumes:
          # Backed by the `ray-job-code-sample` ConfigMap; each key becomes a file.
          - name: code-sample
            configMap:
              name: ray-job-code-sample
              items:
                - key: sample_code.py
                  path: sample_code.py
                - key: fail_fast.py
                  path: fail_fast.py
  workerGroupSpecs:
    # Number of worker Pod replicas in this group.
    - replicas: 1
      minReplicas: 1
      maxReplicas: 5
      # Logical group name. Here it is `small-group`; it can also describe the
      # group's function.
      groupName: small-group
      # `rayStartParams` are used to configure the `ray start` command.
      # Default settings of `rayStartParams` in KubeRay:
      #   https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md
      # All available options in `rayStartParams`:
      #   https://docs.ray.io/en/latest/cluster/cli.html#ray-start
      rayStartParams: {}
      # Pod template
      template:
        spec:
          containers:
            # The container name must consist of lower case alphanumeric characters
            # or '-', and must start and end with an alphanumeric character
            # (e.g. 'my-name' or '123-abc').
            - name: ray-worker
              image: rayproject/ray:2.7.0
              lifecycle:
                # Run `ray stop` before the container is terminated.
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
              resources:
                limits:
                  cpu: "1"
                requests:
                  cpu: "1"
---
# This sample is from https://docs.ray.io/en/latest/cluster/job-submission.html#quick-start-example.
# It is mounted into the container and executed to show the Ray job at work.
apiVersion: v1
kind: ConfigMap
metadata:
  name: ray-job-code-sample
data:
  # Happy-path job: increments a remote Counter actor five times and checks that
  # the job's runtime environment (env var and pinned `requests` version) was applied.
  sample_code.py: |
    import ray
    import os
    import requests

    ray.init()

    @ray.remote
    class Counter:
        def __init__(self):
            # Used to verify runtimeEnv
            self.name = os.getenv("counter_name")
            assert self.name == "test_counter"
            self.counter = 0

        def inc(self):
            self.counter += 1

        def get_counter(self):
            return "{} got {}".format(self.name, self.counter)

    counter = Counter.remote()

    for _ in range(5):
        ray.get(counter.inc.remote())
        print(ray.get(counter.get_counter.remote()))

    # Verify that the correct runtime env was used for the job.
    assert requests.__version__ == "2.26.0"
  # Failure-path job: writes a message to stderr and exits with status 1.
  # Uses the Python 3 `print(..., file=...)` form; the Python 2 `print >>` statement
  # would raise TypeError before reaching `sys.exit(1)`.
  fail_fast.py: |
    import sys

    print("Something is seriously wrong.", file=sys.stderr)
    sys.exit(1)
---
# The RayJob references the RayCluster into which the Ray job is to be submitted.
# You can change the entrypoint to fail_fast.py to test the case where the job fails.
apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: rayjob-cluster-selector
spec:
  # Select the existing RayCluster by name instead of creating a new one.
  clusterSelector:
    ray.io/cluster: long-running-cluster
  entrypoint: python /home/ray/samples/sample_code.py
  # entrypoint: python /home/ray/samples/fail_fast.py
  # Runtime environment for the job: the pinned `requests` version and the
  # `counter_name` variable are both asserted by sample_code.py.
  runtimeEnvYAML: |
    pip:
      - requests==2.26.0
      - pendulum==2.1.2
    env_vars:
      counter_name: test_counter
  # ttlSecondsAfterFinished: 60
  # Pod template for the job submitter container.
  submitterPodTemplate:
    spec:
      containers:
        - name: ray-job-submitter
          image: rayproject/ray:2.7.0
          resources:
            requests:
              cpu: 500m
              memory: 200Mi
            limits:
              cpu: 500m
              memory: 200Mi
      restartPolicy: Never