Skip to content

Commit

Permalink
[RayJob]: Add RayJob with cluster selector sample
Browse files Browse the repository at this point in the history
  • Loading branch information
astefanutti committed Oct 18, 2023
1 parent 62519ac commit 2a56347
Showing 1 changed file with 149 additions and 0 deletions.
149 changes: 149 additions & 0 deletions ray-operator/config/samples/ray-job.cluster-selector.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# This YAML file is primarily for testing purposes. It creates a "long-running" RayCluster,
# and then creates a RayJob whose cluster selector matches the name of that previously
# created RayCluster, so the job is submitted into it.
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: long-running-cluster
spec:
  rayVersion: '2.7.0'  # should match the Ray version in the image of the containers
  # Ray head pod template
  headGroupSpec:
    # Overrides the name of the Service created for the head pod.
    headService:
      metadata:
        name: custom-ray-head-service-name
    # The `rayStartParams` are used to configure the `ray start` command.
    # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
    # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
    rayStartParams:
      dashboard-host: '0.0.0.0'
    # Pod template
    template:
      spec:
        containers:
          - name: ray-head
            image: rayproject/ray:2.7.0
            ports:
              - containerPort: 6379
                name: gcs-server
              - containerPort: 8265  # Ray dashboard
                name: dashboard
              - containerPort: 10001
                name: client
            resources:
              limits:
                cpu: "1"
              requests:
                cpu: "1"
            # Exposes the scripts from the `ray-job-code-sample` ConfigMap
            # under /home/ray/samples, the path used by the RayJob entrypoint.
            volumeMounts:
              - mountPath: /home/ray/samples
                name: code-sample
        volumes:
          - name: code-sample
            configMap:
              name: ray-job-code-sample
              items:
                - key: sample_code.py
                  path: sample_code.py
                - key: fail_fast.py
                  path: fail_fast.py
  workerGroupSpecs:
    # Number of worker pod replicas in this group, with its scaling bounds
    - replicas: 1
      minReplicas: 1
      maxReplicas: 5
      # Logical group name (here `small-group`); it can also describe the group's function
      groupName: small-group
      # The `rayStartParams` are used to configure the `ray start` command.
      # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
      # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
      rayStartParams: {}
      # Pod template
      template:
        spec:
          containers:
            # The container name must consist of lower case alphanumeric characters or '-',
            # and must start and end with an alphanumeric character (e.g. 'my-name' or '123-abc').
            - name: ray-worker
              image: rayproject/ray:2.7.0
              lifecycle:
                # Run `ray stop` before the container is terminated.
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
              resources:
                limits:
                  cpu: "1"
                requests:
                  cpu: "1"
---
# This sample is from https://docs.ray.io/en/latest/cluster/job-submission.html#quick-start-example.
# It is mounted into the container and executed to show the Ray job at work.
apiVersion: v1
kind: ConfigMap
metadata:
  name: ray-job-code-sample
data:
  # Happy-path script: increments a remote counter five times and checks that
  # the job's runtime environment (env var and pinned `requests`) was applied.
  sample_code.py: |
    import ray
    import os
    import requests

    ray.init()

    @ray.remote
    class Counter:
        def __init__(self):
            # Used to verify runtimeEnv
            self.name = os.getenv("counter_name")
            assert self.name == "test_counter"
            self.counter = 0

        def inc(self):
            self.counter += 1

        def get_counter(self):
            return "{} got {}".format(self.name, self.counter)

    counter = Counter.remote()

    for _ in range(5):
        ray.get(counter.inc.remote())
        print(ray.get(counter.get_counter.remote()))

    # Verify that the correct runtime env was used for the job.
    assert requests.__version__ == "2.26.0"
  # Failure-path script: writes a message to stderr and exits with status 1.
  # Uses the Python 3 `print(..., file=...)` form; the Python 2 `print >> stream`
  # statement raises TypeError on Python 3 before the message is written.
  fail_fast.py: |
    import sys

    print("Something is seriously wrong.", file=sys.stderr)
    sys.exit(1)
---
# The RayJob that references the RayCluster into which the Ray job is to be submitted.
# You can change the entrypoint to fail_fast.py for testing the case where the job fails.
apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: rayjob-cluster-selector
spec:
  # Selects the existing RayCluster by name instead of creating a new one.
  clusterSelector:
    ray.io/cluster: long-running-cluster
  entrypoint: python /home/ray/samples/sample_code.py
  # entrypoint: python /home/ray/samples/fail_fast.py
  # Runtime environment for the job, given as an embedded YAML document.
  # sample_code.py asserts on both the `requests` version and `counter_name`.
  runtimeEnvYAML: |
    pip:
      - requests==2.26.0
      - pendulum==2.1.2
    env_vars:
      counter_name: test_counter
  # ttlSecondsAfterFinished: 60
  # Pod template for the pod that submits the job to the cluster.
  submitterPodTemplate:
    spec:
      containers:
        - name: ray-job-submitter
          image: rayproject/ray:2.7.0
          resources:
            requests:
              cpu: 500m
              memory: 200Mi
            limits:
              cpu: 500m
              memory: 200Mi
      restartPolicy: Never

0 comments on commit 2a56347

Please sign in to comment.