From 465379be06f865c4b13959d388c90e65738a9950 Mon Sep 17 00:00:00 2001 From: avelichk Date: Wed, 16 Sep 2020 15:45:53 +0100 Subject: [PATCH 1/4] Tekton example Add README for Tekton examples Add yaml with PipelineRun --- examples/v1beta1/tekton/README.md | 41 +++++++ examples/v1beta1/tekton/pipeline-run.yaml | 104 ++++++++++++++++++ .../katib-controller/katib-controller.yaml | 1 + manifests/v1beta1/katib-controller/rbac.yaml | 6 + 4 files changed, 152 insertions(+) create mode 100644 examples/v1beta1/tekton/README.md create mode 100644 examples/v1beta1/tekton/pipeline-run.yaml diff --git a/examples/v1beta1/tekton/README.md b/examples/v1beta1/tekton/README.md new file mode 100644 index 00000000000..d1eb9ca90cf --- /dev/null +++ b/examples/v1beta1/tekton/README.md @@ -0,0 +1,41 @@ +# Katib examples with Tekton integration + +Here you can find examples of using Katib with [Tekton](https://github.com/tektoncd/pipeline). +Check [here](https://github.com/tektoncd/pipeline/blob/master/docs/install.md#installing-tekton-pipelines-on-kubernetes) how to install Tekton on your cluster. + +**Note** that you must modify Tekton [`nop`](https://github.com/tektoncd/pipeline/tree/master/cmd/nop) image to run Tekton pipelines. `Nop` images is used to stop sidecar containers after main container is completed. Metrics collector must be not stopped after training container is finished. To avoid this problem, `nop` image should be equal to metrics collector sidecar image. + +For example, if you are using [StdOut](https://www.kubeflow.org/docs/components/hyperparameter-tuning/experiment/#metrics-collector) metrics collector, `nop` image must be equal to `gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector`. + +After deploying Tekton on your cluster, run bellow command to modify `nop` image. 
+ +```bash +kubectl patch deploy tekton-pipelines-controller -n tekton-pipelines --type='json' \ +-p='[{"op": "replace", "path": "/spec/template/spec/containers/0/args/9", "value": "gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector"}]' +``` + +Check that Tekton controller's pod was restarted: + +``` +kubectl get pods -n tekton-pipelines +``` + +Expected output: + +``` +NAME READY STATUS RESTARTS AGE +tekton-pipelines-controller-7fcb6c6cd4-p8zf2 1/1 Running 0 2m2s +tekton-pipelines-webhook-7f9888f9b-7d6mr 1/1 Running 0 12h +``` + +Check that `nop` image was modified: + +``` +kubectl get pod -n tekton-pipelines- -o yaml | grep katib/v1beta1/file-metrics-collector +``` + +Expected output: + +``` +- gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector +``` diff --git a/examples/v1beta1/tekton/pipeline-run.yaml b/examples/v1beta1/tekton/pipeline-run.yaml new file mode 100644 index 00000000000..dfe0de8b350 --- /dev/null +++ b/examples/v1beta1/tekton/pipeline-run.yaml @@ -0,0 +1,104 @@ +# This examples shows how you can use Tekton Pipelines in Katib. +# PipelineRun shows how you can transfer parameters from one Task to another and run HP job. +# It uses simple random algorithm and tunes only learning rate. +# Pipelines contains 2 Tasks, first is data-preprocessing second is model-training. +# First Task shows how you can prepare your training data (simply divide number of training examples) before running HP job. +# Number of examples is transferred to the second Task. +# Second Task is the actual training which metrics collector sidecar is injected. +# Note that for this example Tekton controller's nop image must be equal to StdOut metrics collector image. 
+apiVersion: "kubeflow.org/v1beta1" +kind: Experiment +metadata: + namespace: kubeflow + name: tekton-pipeline-run +spec: + objective: + type: maximize + goal: 0.99 + objectiveMetricName: Validation-accuracy + additionalMetricNames: + - Train-accuracy + algorithm: + algorithmName: random + parallelTrialCount: 2 + maxTrialCount: 4 + maxFailedTrialCount: 3 + parameters: + - name: lr + parameterType: double + feasibleSpace: + min: "0.01" + max: "0.03" + trialTemplate: + retain: true + primaryPodLabels: + tekton.dev/pipelineTask: model-training + primaryContainerName: step-model-training + successCondition: status.conditions.#(type=="Succeeded")#|#(status=="True")# + failureCondition: status.conditions.#(type=="Succeeded")#|#(status=="False")# + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + trialSpec: + apiVersion: tekton.dev/v1beta1 + kind: PipelineRun + spec: + params: + - name: lr + value: ${trialParameters.learningRate} + - name: num-examples-init + value: "60000" + pipelineSpec: + params: + - name: lr + description: Learning rate for the training model + - name: num-examples-init + description: Initial value for number of training examples + tasks: + - name: data-preprocessing + params: + - name: num-examples-pre + value: $(params.num-examples-init) + taskSpec: + metadata: + annotations: + sidecar.istio.io/inject: "false" + params: + - name: num-examples-pre + description: Number of training examples before optimization + results: + - name: num-examples-post + description: Number of training examples after optimization + steps: + - name: num-examples-optimize + image: python:alpine3.6 + command: + - sh + - -c + args: + - python3 -c "import random; print($(params.num-examples-pre)//random.randint(10,100),end='')" | tee $(results.num-examples-post.path) + - name: model-training + params: + - name: lr + value: $(params.lr) + - name: num-examples + value: 
$(tasks.data-preprocessing.results.num-examples-post) + taskSpec: + metadata: + annotations: + sidecar.istio.io/inject: "false" + params: + - name: lr + description: Learning rate for the training model + - name: num-examples + description: Number of training examples + steps: + - name: model-training + image: docker.io/kubeflowkatib/mxnet-mnist + command: + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--batch-size=64" + - "--num-examples=$(params.num-examples)" + - "--lr=$(params.lr)" diff --git a/manifests/v1beta1/katib-controller/katib-controller.yaml b/manifests/v1beta1/katib-controller/katib-controller.yaml index 4dfb01fd445..4d362383bcc 100644 --- a/manifests/v1beta1/katib-controller/katib-controller.yaml +++ b/manifests/v1beta1/katib-controller/katib-controller.yaml @@ -29,6 +29,7 @@ spec: - "--trial-resources=TFJob.v1.kubeflow.org" - "--trial-resources=PyTorchJob.v1.kubeflow.org" - "--trial-resources=MPIJob.v1.kubeflow.org" + - "--trial-resources=PipelineRun.v1beta1.tekton.dev" ports: - containerPort: 8443 name: webhook diff --git a/manifests/v1beta1/katib-controller/rbac.yaml b/manifests/v1beta1/katib-controller/rbac.yaml index dc12f9f3db7..a902f47faf8 100644 --- a/manifests/v1beta1/katib-controller/rbac.yaml +++ b/manifests/v1beta1/katib-controller/rbac.yaml @@ -73,6 +73,12 @@ rules: - mpijobs verbs: - "*" + - apiGroups: + - tekton.dev + resources: + - pipelineruns + verbs: + - "*" --- apiVersion: v1 kind: ServiceAccount From 4d0dc5bf34f640eef451eec37f0f757b924ef11b Mon Sep 17 00:00:00 2001 From: avelichk Date: Wed, 16 Sep 2020 16:08:17 +0100 Subject: [PATCH 2/4] Fix README --- examples/v1beta1/tekton/README.md | 35 +++++++++++------------ examples/v1beta1/tekton/pipeline-run.yaml | 8 ++---- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/examples/v1beta1/tekton/README.md b/examples/v1beta1/tekton/README.md index d1eb9ca90cf..2caa44867cf 100644 --- a/examples/v1beta1/tekton/README.md +++ b/examples/v1beta1/tekton/README.md @@ -1,28 
+1,31 @@ # Katib examples with Tekton integration Here you can find examples of using Katib with [Tekton](https://github.com/tektoncd/pipeline). -Check [here](https://github.com/tektoncd/pipeline/blob/master/docs/install.md#installing-tekton-pipelines-on-kubernetes) how to install Tekton on your cluster. -**Note** that you must modify Tekton [`nop`](https://github.com/tektoncd/pipeline/tree/master/cmd/nop) image to run Tekton pipelines. `Nop` images is used to stop sidecar containers after main container is completed. Metrics collector must be not stopped after training container is finished. To avoid this problem, `nop` image should be equal to metrics collector sidecar image. +Check [here](https://github.com/tektoncd/pipeline/blob/master/docs/install.md#installing-tekton-pipelines-on-kubernetes) +how to install Tekton on your cluster. -For example, if you are using [StdOut](https://www.kubeflow.org/docs/components/hyperparameter-tuning/experiment/#metrics-collector) metrics collector, `nop` image must be equal to `gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector`. +**Note** that you must modify the Tekton [`nop`](https://github.com/tektoncd/pipeline/tree/master/cmd/nop) +image to run Tekton pipelines. The `nop` image is used to stop sidecar containers after the main container +is completed. The metrics collector should not be stopped after the training container is finished. +To avoid this problem, set the `nop` image to the metrics collector sidecar image. -After deploying Tekton on your cluster, run bellow command to modify `nop` image. +For example, if you are using the +[StdOut](https://www.kubeflow.org/docs/components/hyperparameter-tuning/experiment/#metrics-collector) metrics collector, +the `nop` image must be equal to `gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector`. 
+ +After deploying Tekton on your cluster, run the command below to modify `nop` image: ```bash kubectl patch deploy tekton-pipelines-controller -n tekton-pipelines --type='json' \ --p='[{"op": "replace", "path": "/spec/template/spec/containers/0/args/9", "value": "gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector"}]' + -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/args/9", "value": "gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector"}]' ``` Check that Tekton controller's pod was restarted: -``` -kubectl get pods -n tekton-pipelines -``` - -Expected output: +```bash +$ kubectl get pods -n tekton-pipelines -``` NAME READY STATUS RESTARTS AGE tekton-pipelines-controller-7fcb6c6cd4-p8zf2 1/1 Running 0 2m2s tekton-pipelines-webhook-7f9888f9b-7d6mr 1/1 Running 0 12h @@ -30,12 +33,8 @@ tekton-pipelines-webhook-7f9888f9b-7d6mr 1/1 Running 0 12h Check that `nop` image was modified: -``` -kubectl get pod -n tekton-pipelines- -o yaml | grep katib/v1beta1/file-metrics-collector -``` - -Expected output: +```bash +$ kubectl get pod -n tekton-pipelines -o yaml | grep katib/v1beta1/file-metrics-collector -``` -- gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector + - gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector ``` diff --git a/examples/v1beta1/tekton/pipeline-run.yaml b/examples/v1beta1/tekton/pipeline-run.yaml index dfe0de8b350..b8aac248b0d 100644 --- a/examples/v1beta1/tekton/pipeline-run.yaml +++ b/examples/v1beta1/tekton/pipeline-run.yaml @@ -1,9 +1,8 @@ -# This examples shows how you can use Tekton Pipelines in Katib. -# PipelineRun shows how you can transfer parameters from one Task to another and run HP job. +# This examples shows how you can use Tekton Pipelines in Katib, transfer parameters from one Task to another and run HP job. # It uses simple random algorithm and tunes only learning rate. # Pipelines contains 2 Tasks, first is data-preprocessing second is model-training. 
-# First Task shows how you can prepare your training data (simply divide number of training examples) before running HP job. -# Number of examples is transferred to the second Task. +# First Task shows how you can prepare your training data (here: simply divide number of training examples) before running HP job. +# Number of training examples is transferred to the second Task. # Second Task is the actual training which metrics collector sidecar is injected. # Note that for this example Tekton controller's nop image must be equal to StdOut metrics collector image. apiVersion: "kubeflow.org/v1beta1" @@ -99,6 +98,5 @@ spec: command: - "python3" - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" - "--num-examples=$(params.num-examples)" - "--lr=$(params.lr)" From 8d402ad78fa7c13ae03300e82490b27f0df34299 Mon Sep 17 00:00:00 2001 From: avelichk Date: Wed, 14 Oct 2020 15:56:15 +0100 Subject: [PATCH 3/4] Remove istio annotation --- examples/v1beta1/tekton/pipeline-run.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/v1beta1/tekton/pipeline-run.yaml b/examples/v1beta1/tekton/pipeline-run.yaml index b8aac248b0d..839aa86190d 100644 --- a/examples/v1beta1/tekton/pipeline-run.yaml +++ b/examples/v1beta1/tekton/pipeline-run.yaml @@ -60,9 +60,6 @@ spec: - name: num-examples-pre value: $(params.num-examples-init) taskSpec: - metadata: - annotations: - sidecar.istio.io/inject: "false" params: - name: num-examples-pre description: Number of training examples before optimization @@ -84,9 +81,6 @@ spec: - name: num-examples value: $(tasks.data-preprocessing.results.num-examples-post) taskSpec: - metadata: - annotations: - sidecar.istio.io/inject: "false" params: - name: lr description: Learning rate for the training model From b4e61e48e77abf61675b3d8af84307ac60697f53 Mon Sep 17 00:00:00 2001 From: avelichk Date: Sat, 17 Oct 2020 03:06:37 +0100 Subject: [PATCH 4/4] Fix comment --- examples/v1beta1/tekton/pipeline-run.yaml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/examples/v1beta1/tekton/pipeline-run.yaml b/examples/v1beta1/tekton/pipeline-run.yaml index 839aa86190d..4a9d12bc335 100644 --- a/examples/v1beta1/tekton/pipeline-run.yaml +++ b/examples/v1beta1/tekton/pipeline-run.yaml @@ -1,4 +1,4 @@ -# This examples shows how you can use Tekton Pipelines in Katib, transfer parameters from one Task to another and run HP job. +# This example shows how you can use Tekton Pipelines in Katib, transfer parameters from one Task to another and run HP job. # It uses simple random algorithm and tunes only learning rate. # Pipelines contains 2 Tasks, first is data-preprocessing second is model-training. # First Task shows how you can prepare your training data (here: simply divide number of training examples) before running HP job.