Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't delete Ray head Pod when ImagePullSecret is provided #654

Merged
merged 1 commit into from
Jan 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/e2e_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ jobs:
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}

set -euo pipefail
go test -timeout 60m -v -skip "^Test.*Cpu$" ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
go test -timeout 120m -v -skip "^Test.*Cpu$" ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt

- name: Print CodeFlare operator logs
if: always() && steps.deploy.outcome == 'success'
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ require (
github.com/openshift/api v0.0.0-20230823114715-5fdd7511b790
github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c
github.com/project-codeflare/appwrapper v0.30.0
github.com/project-codeflare/codeflare-common v0.0.0-20241216183607-222395d38924
github.com/project-codeflare/codeflare-common v0.0.0-20250117134355-5748d670cd4a
github.com/ray-project/kuberay/ray-operator v1.2.1
go.uber.org/zap v1.27.0
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/project-codeflare/appwrapper v0.30.0 h1:tb9LJ/QmC2xyKdM0oVf+WAz9cKIGt3gllDrRgzySgyo=
github.com/project-codeflare/appwrapper v0.30.0/go.mod h1:7FpO90DLv0BAq4rwZtXKS9aRRfkR9RvXsj3pgYF0HtQ=
github.com/project-codeflare/codeflare-common v0.0.0-20241216183607-222395d38924 h1:jM+gYqn8eGmUoeQLGGYxlJgXZ1gbZgB2UtpKU9z0x9s=
github.com/project-codeflare/codeflare-common v0.0.0-20241216183607-222395d38924/go.mod h1:DPSv5khRiRDFUD43SF8da+MrVQTWmxNhuKJmwSLOyO0=
github.com/project-codeflare/codeflare-common v0.0.0-20250117134355-5748d670cd4a h1:1F5xsxncIL5Bpboup8d5osQ8iWy/hzkCTtGSBZM2tQM=
github.com/project-codeflare/codeflare-common v0.0.0-20250117134355-5748d670cd4a/go.mod h1:DPSv5khRiRDFUD43SF8da+MrVQTWmxNhuKJmwSLOyO0=
github.com/prometheus/client_golang v1.20.4 h1:Tgh3Yr67PaOv/uTqloMsCEdeuFTatm5zIq5+qNN23vI=
github.com/prometheus/client_golang v1.20.4/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
Expand Down
7 changes: 5 additions & 2 deletions pkg/controllers/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -219,8 +219,11 @@ func (r *RayClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
return ctrl.Result{RequeueAfter: requeueTime}, err
}

if err := r.deleteHeadPodIfMissingImagePullSecrets(ctx, cluster); err != nil {
return ctrl.Result{RequeueAfter: requeueTime}, err
if len(cluster.Spec.HeadGroupSpec.Template.Spec.ImagePullSecrets) == 0 {
// Delete head pod only if user doesn't specify own imagePullSecrets and imagePullSecrets from OAuth ServiceAccount are not present in the head Pod
if err := r.deleteHeadPodIfMissingImagePullSecrets(ctx, cluster); err != nil {
return ctrl.Result{RequeueAfter: requeueTime}, err
}
}

_, err = r.kubeClient.RbacV1().ClusterRoleBindings().Apply(ctx, desiredOAuthClusterRoleBinding(cluster), metav1.ApplyOptions{FieldManager: controllerName, Force: true})
Expand Down
67 changes: 67 additions & 0 deletions pkg/controllers/raycluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,73 @@ var _ = Describe("RayCluster controller", func() {
}).WithTimeout(time.Second * 10).Should(Satisfy(errors.IsNotFound))
})

It("should not delete the head pod if RayCluster CR provides image pull secrets", func(ctx SpecContext) {
By("creating an instance of the RayCluster CR with imagePullSecret")
rayclusterWithPullSecret := &rayv1.RayCluster{
ObjectMeta: metav1.ObjectMeta{
Name: "pull-secret-cluster",
Namespace: namespaceName,
},
Spec: rayv1.RayClusterSpec{
HeadGroupSpec: rayv1.HeadGroupSpec{
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
ImagePullSecrets: []corev1.LocalObjectReference{{Name: "custom-pull-secret"}},
Containers: []corev1.Container{},
},
},
RayStartParams: map[string]string{},
},
},
}
_, err := rayClient.RayV1().RayClusters(namespaceName).Create(ctx, rayclusterWithPullSecret, metav1.CreateOptions{})
Expect(err).To(Not(HaveOccurred()))

Eventually(func() (*corev1.ServiceAccount, error) {
return k8sClient.CoreV1().ServiceAccounts(namespaceName).Get(ctx, oauthServiceAccountNameFromCluster(rayclusterWithPullSecret), metav1.GetOptions{})
}).WithTimeout(time.Second * 10).Should(WithTransform(OwnerReferenceKind, Equal("RayCluster")))

headPodName := "head-pod"
headPod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: headPodName,
Namespace: namespaceName,
Labels: map[string]string{
"ray.io/node-type": "head",
"ray.io/cluster": rayclusterWithPullSecret.Name,
},
},
Spec: corev1.PodSpec{
ImagePullSecrets: []corev1.LocalObjectReference{
{Name: "custom-pull-secret"},
},
Containers: []corev1.Container{
{
Name: "head-container",
Image: "busybox",
},
},
},
}
_, err = k8sClient.CoreV1().Pods(namespaceName).Create(ctx, headPod, metav1.CreateOptions{})
Expect(err).To(Not(HaveOccurred()))

Eventually(func() (*corev1.Pod, error) {
return k8sClient.CoreV1().Pods(namespaceName).Get(ctx, headPodName, metav1.GetOptions{})
}).WithTimeout(time.Second * 10).ShouldNot(BeNil())

sa, err := k8sClient.CoreV1().ServiceAccounts(namespaceName).Get(ctx, oauthServiceAccountNameFromCluster(rayclusterWithPullSecret), metav1.GetOptions{})
Expect(err).To(Not(HaveOccurred()))

sa.ImagePullSecrets = append(sa.ImagePullSecrets, corev1.LocalObjectReference{Name: "test-image-pull-secret"})
_, err = k8sClient.CoreV1().ServiceAccounts(namespaceName).Update(ctx, sa, metav1.UpdateOptions{})
Expect(err).To(Not(HaveOccurred()))

Consistently(func() (*corev1.Pod, error) {
return k8sClient.CoreV1().Pods(namespaceName).Get(ctx, headPodName, metav1.GetOptions{})
}).WithTimeout(time.Second * 5).Should(Not(BeNil()))
})

It("should remove CRB when the RayCluster is deleted", func(ctx SpecContext) {
foundRayCluster, err := rayClient.RayV1().RayClusters(namespaceName).Get(ctx, rayClusterName, metav1.GetOptions{})
Expect(err).To(Not(HaveOccurred()))
Expand Down
42 changes: 40 additions & 2 deletions test/e2e/mnist_rayjob_raycluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,44 @@ func runMnistRayJobRayClusterAppWrapper(t *testing.T, accelerator string, number
test.Eventually(AppWrappers(test, namespace), TestTimeoutShort).Should(BeEmpty())
}

// Verifying https://github.com/project-codeflare/codeflare-operator/issues/649
func TestRayClusterImagePullSecret(t *testing.T) {
test := With(t)

// Create a namespace
namespace := test.NewTestNamespace()

// Create Kueue resources
resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
defer func() {
_ = test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
}()
clusterQueue := createClusterQueue(test, resourceFlavor, 0)
defer func() {
_ = test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
}()
CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)

// Create MNIST training script
mnist := constructMNISTConfigMap(test, namespace)
mnist, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), mnist, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name)

// Create RayCluster with imagePullSecret and assign it to the localqueue
rayCluster := constructRayCluster(test, namespace, mnist, 0)
rayCluster.Spec.HeadGroupSpec.Template.Spec.ImagePullSecrets = []corev1.LocalObjectReference{{Name: "custom-pull-secret"}}
rayCluster, err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Create(test.Ctx(), rayCluster, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
test.T().Logf("Created RayCluster %s/%s successfully", rayCluster.Namespace, rayCluster.Name)

test.T().Logf("Waiting for RayCluster %s/%s to be running", rayCluster.Namespace, rayCluster.Name)
test.Eventually(RayCluster(test, namespace.Name, rayCluster.Name), TestTimeoutMedium).
Should(WithTransform(RayClusterState, Equal(rayv1.Ready)))
}

// Helper functions

func constructMNISTConfigMap(test Test, namespace *corev1.Namespace) *corev1.ConfigMap {
return &corev1.ConfigMap{
TypeMeta: metav1.TypeMeta{
Expand Down Expand Up @@ -274,11 +312,11 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("250m"),
corev1.ResourceMemory: resource.MustParse("512Mi"),
corev1.ResourceMemory: resource.MustParse("2G"),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("2G"),
corev1.ResourceMemory: resource.MustParse("4G"),
},
},
},
Expand Down
Loading