# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Image for the Katib StudyJob launcher: a Python runtime plus the Katib
# Python client sources, with /ml/launch_study_job.py as the entrypoint.
#
# NOTE(review): ubuntu:16.04 is past standard support; moving to a newer base
# also means moving the launcher to Python 3 -- left unchanged here.
FROM ubuntu:16.04

# Everything is installed in a single layer so the cleanup at the end actually
# shrinks the image: the apt package lists, the pip download cache and the
# Katib source archive are not needed at runtime.
#
# NOTE(review): the Katib sources come from the moving `master` branch, so
# rebuilding this image is not reproducible; consider pinning a tag or commit.
RUN apt-get update -y && \
    apt-get install --no-install-recommends -y -q ca-certificates python-dev python-setuptools wget unzip git && \
    easy_install pip && \
    pip install --no-cache-dir pyyaml==3.12 six==1.11.0 requests==2.18.4 grpcio gcloud google-api-python-client protobuf kubernetes && \
    wget https://github.com/kubeflow/katib/archive/master.zip && unzip master.zip && \
    rm master.zip && \
    rm -rf /var/lib/apt/lists/*

# Makes the generated gRPC stubs (api_pb2, api_pb2_grpc) and study_job_client
# from the unpacked Katib sources importable by the launcher.
ENV PYTHONPATH $PYTHONPATH:/katib-master/pkg/api/python:/katib-master/py

# `build` is assembled by build_image.sh (launcher sources + license tooling).
# COPY is used instead of ADD because no archive extraction or URL fetch is
# wanted here.
COPY build /ml

RUN mkdir /usr/licenses && \
    /ml/license.sh /ml/third_party_licenses.csv /usr/licenses

ENTRYPOINT ["python", "/ml/launch_study_job.py"]
#!/bin/bash -e
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Builds the Katib StudyJob launcher image from ./src and pushes it.
#
#   -p  project name (defaults to the active gcloud project)
#   -t  tag name     (defaults to a date + git-describe + diff-hash tag)
#   -i  full image name; when given, -p and -t are not used
#
# Must be run from the directory containing this script: all paths below are
# relative to the current working directory.

# The shebang's -e is dropped when the script is run as `bash build_image.sh`,
# so enable it explicitly as well.
set -e

while getopts ":hp:t:i:" opt; do
  case "${opt}" in
    h) echo "-p: project name"
       echo "-t: tag name"
       echo "-i: image name. If provided, project name and tag name are not necessary"
       exit
       ;;
    p) PROJECT_ID="${OPTARG}"
       ;;
    t) TAG_NAME="${OPTARG}"
       ;;
    i) LAUNCHER_IMAGE_NAME="${OPTARG}"
       ;;
    # The leading ':' in the optstring selects silent error reporting, so a
    # missing option argument arrives here as ':' and must be handled by us.
    : ) echo "Option -${OPTARG} requires an argument" >&2
       echo "Usage: cmd [-p] project [-t] tag [-i] image" >&2
       exit 1
       ;;
    \? ) echo "Usage: cmd [-p] project [-t] tag [-i] image" >&2
       exit 1
       ;;
  esac
done

mkdir -p ./build
# Remove the staging directory on every exit path, including a failed
# `docker build`/`docker push` that aborts the script through `set -e`.
trap 'rm -rf ./build' EXIT

rsync -arvp ./src/ ./build/

cp ../../license.sh ./build
cp ../../third_party_licenses.csv ./build

LOCAL_LAUNCHER_IMAGE_NAME=ml-pipeline-kubeflow-studyjob

docker build -t "${LOCAL_LAUNCHER_IMAGE_NAME}" .
if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then
  if [ -z "${TAG_NAME}" ]; then
    TAG_NAME="$(date +v%Y%m%d)-$(git describe --tags --always --dirty)-$(git diff | shasum -a256 | cut -c -6)"
  fi
  if [ -z "${PROJECT_ID}" ]; then
    PROJECT_ID="$(gcloud config config-helper --format "value(configuration.properties.core.project)")"
  fi
  docker tag "${LOCAL_LAUNCHER_IMAGE_NAME}" "gcr.io/${PROJECT_ID}/${LOCAL_LAUNCHER_IMAGE_NAME}:${TAG_NAME}"
  docker push "gcr.io/${PROJECT_ID}/${LOCAL_LAUNCHER_IMAGE_NAME}:${TAG_NAME}"
else
  docker tag "${LOCAL_LAUNCHER_IMAGE_NAME}" "${LAUNCHER_IMAGE_NAME}"
  docker push "${LAUNCHER_IMAGE_NAME}"
fi
def kubeflow_studyjob_launcher_op(name, namespace, optimizationtype, objectivevaluename, optimizationgoal, requestcount, metricsnames,
                                  parameterconfigs, nasConfig, workertemplatepath, mcollectortemplatepath, suggestionspec,
                                  studyjob_timeout_minutes, output_file='/output.txt', step_name='StudyJob-Launcher',
                                  image='liuhougangxa/ml-pipeline-kubeflow-studyjob:latest'):
    """Build the pipeline step that launches a Katib StudyJob.

    Every argument is forwarded unchanged to the launcher container as the
    command-line flag of the same (lower-cased) name.

    Args:
        name: StudyJob name (``--name``).
        namespace: StudyJob namespace (``--namespace``).
        optimizationtype: Forwarded as ``--optimizationtype``.
        objectivevaluename: Forwarded as ``--objectivevaluename``.
        optimizationgoal: Forwarded as ``--optimizationgoal``.
        requestcount: Forwarded as ``--requestcount``.
        metricsnames: Forwarded as ``--metricsnames``.
        parameterconfigs: Forwarded as ``--parameterconfigs``.
        nasConfig: Forwarded as ``--nasConfig``.
        workertemplatepath: Forwarded as ``--workertemplatepath``.
        mcollectortemplatepath: Forwarded as ``--mcollectortemplatepath``.
        suggestionspec: Forwarded as ``--suggestionspec``.
        studyjob_timeout_minutes: Forwarded as ``--studyjobtimeoutminutes``.
        output_file: Path inside the container that the launcher writes its
            result to; exposed as the step's ``hyperparameter`` file output.
        step_name: Name of the pipeline step.
        image: Launcher container image. The default is kept for backward
            compatibility.
            NOTE(review): the default is a personal repository with a moving
            ``latest`` tag; callers should pass a pinned image.

    Returns:
        The ``dsl.ContainerOp`` for the launcher step.
    """
    return dsl.ContainerOp(
        name=step_name,
        image=image,
        arguments=[
            '--name', name,
            '--namespace', namespace,
            '--optimizationtype', optimizationtype,
            '--objectivevaluename', objectivevaluename,
            '--optimizationgoal', optimizationgoal,
            '--requestcount', requestcount,
            '--metricsnames', metricsnames,
            '--parameterconfigs', parameterconfigs,
            '--nasConfig', nasConfig,
            '--workertemplatepath', workertemplatepath,
            '--mcollectortemplatepath', mcollectortemplatepath,
            '--suggestionspec', suggestionspec,
            '--outputfile', output_file,
            '--studyjobtimeoutminutes', studyjob_timeout_minutes,
        ],
        file_outputs={'hyperparameter': output_file}
    )
# Skeleton StudyJob manifest used by launch_study_job.py.
#
# The launcher loads this file, overwrites or removes the fields marked below
# from its command-line arguments, and submits the result. The concrete values
# here are placeholders/examples only.
apiVersion: "kubeflow.org/v1alpha1"
kind: StudyJob
metadata:
  # Overwritten with --namespace.
  namespace: kubeflow
  labels:
    controller-tools.k8s.io: "1.0"
  # Overwritten with --name.
  name: study-example
spec:
  # Overwritten with --name (same value as metadata.name).
  studyName: study-example
  owner: crd
  # The next four fields are always overwritten with the launcher arguments
  # of the same name.
  optimizationtype: ""
  objectivevaluename: ""
  optimizationgoal: 0.99
  requestcount: 4
  # Replaced with --metricsnames, or removed when that argument is empty.
  metricsnames:
    - accuracy_1
  # Replaced with --nasConfig, or removed when that argument is empty.
  nasConfig:
    graphConfig:
      numLayers: 8
      inputSize:
        - 32
        - 32
        - 3
      outputSize:
        - 10
    operations:
      - operationType: convolution
        parameterconfigs:
          - name: filter_size
            parametertype: categorical
            feasible:
              list:
                - "3"
                - "5"
                - "7"
  # Replaced with --parameterconfigs, or removed when that argument is empty.
  parameterconfigs:
    - name: --learning_rate
      parametertype: double
      feasible:
        min: "0.01"
        max: "0.05"
  # templatePath is set from --workertemplatepath; the whole workerSpec
  # section is removed when that argument is empty.
  workerSpec:
    goTemplate:
      templatePath: ""
  # templatePath is set from --mcollectortemplatepath; the whole
  # metricsCollectorSpec section is removed when that argument is empty.
  metricsCollectorSpec:
    goTemplate:
      templatePath: ""
  # Replaced with --suggestionspec, or removed when that argument is empty.
  suggestionSpec:
    suggestionAlgorithm: "random"
def yamlOrJsonStr(str):
    """Parse a command-line value given as a JSON or YAML document.

    JSON is tried first; anything that is not valid JSON is parsed as YAML.

    Args:
        str: Raw argument text. (The name shadows the builtin; it is kept so
            the function signature stays unchanged.)

    Returns:
        The parsed object, or None when the input is empty or None.
    """
    if str is None or str == "":
        return None
    try:
        return json.loads(str)
    except ValueError:
        # Not JSON. safe_load is used because this text comes straight from
        # the command line and must not be able to construct arbitrary
        # Python objects.
        return yaml.safe_load(str)


def strToList(str):
    """Split a comma-separated string into a list of names.

    Surrounding whitespace is stripped and empty items are dropped, so an
    empty string yields an empty list rather than [''].

    Args:
        str: Comma-separated text. (Name kept for signature compatibility.)

    Returns:
        List of the non-empty, stripped items.
    """
    return [item.strip() for item in str.split(",") if item.strip()]


def _update_or_pop(spec, name, value):
    """Set spec[name] to value, or remove the key when value is empty.

    Removing a key that is not present is a no-op.
    """
    if value:
        spec[name] = value
    else:
        spec.pop(name, None)


def _generate_studyjob_yaml(src_filename, name, namespace, optimizationtype, objectivevaluename, optimizationgoal, requestcount,
                            metricsnames, parameterconfigs, nasConfig, workertemplatepath, mcollectortemplatepath, suggestionspec):
    """Build the StudyJob manifest from the template file.

    Loads src_filename (hp.template.yaml), overwrites the name, namespace
    and optimisation fields, and replaces or removes the optional sections
    depending on whether the matching argument is empty.

    Returns:
        The manifest as a dict.
    """
    with open(src_filename, 'r') as f:
        content = yaml.safe_load(f)

    metadata = content['metadata']
    spec = content['spec']

    metadata['name'] = name
    metadata['namespace'] = namespace
    spec['studyName'] = name
    spec['optimizationtype'] = optimizationtype
    spec['objectivevaluename'] = objectivevaluename
    spec['optimizationgoal'] = optimizationgoal
    spec['requestcount'] = requestcount

    _update_or_pop(spec, 'parameterconfigs', parameterconfigs)
    _update_or_pop(spec, 'nasConfig', nasConfig)
    _update_or_pop(spec, 'metricsnames', metricsnames)
    _update_or_pop(spec, 'suggestionSpec', suggestionspec)

    # The two template sections are nested, so only the templatePath leaf is
    # filled in; without a path the whole section is dropped.
    if workertemplatepath:
        spec['workerSpec']['goTemplate']['templatePath'] = workertemplatepath
    else:
        spec.pop('workerSpec', None)

    if mcollectortemplatepath:
        spec['metricsCollectorSpec']['goTemplate']['templatePath'] = mcollectortemplatepath
    else:
        spec.pop('metricsCollectorSpec', None)

    return content


def get_best_trial(trial_id, vizier_core="vizier-core.kubeflow:6789"):
    """Fetch a trial from the Katib manager over gRPC.

    Args:
        trial_id: Id of the trial to fetch.
        vizier_core: host:port of the manager service. Defaults to the
            previously hard-coded address.

    Returns:
        The ``trial`` field of the GetTrial response.
    """
    with grpc.insecure_channel(vizier_core) as channel:
        stub = api_pb2_grpc.ManagerStub(channel)
        response = stub.GetTrial(api_pb2.GetTrialRequest(trial_id=trial_id))
        return response.trial


def _build_arg_parser():
    """Create the command-line parser for the launcher."""
    parser = argparse.ArgumentParser(description='Kubeflow StudyJob launcher')
    parser.add_argument('--name', type=str,
                        help='StudyJob name.')
    parser.add_argument('--namespace', type=str,
                        default='kubeflow',
                        help='StudyJob namespace.')
    parser.add_argument('--optimizationtype', type=str,
                        default='minimize',
                        help='Direction of optimization. minimize or maximize.')
    parser.add_argument('--objectivevaluename', type=str,
                        help='Objective value name which trainer optimizes.')
    parser.add_argument('--optimizationgoal', type=float,
                        help='Stop studying once objectivevaluename value ' +
                             'exceeds optimizationgoal')
    parser.add_argument('--requestcount', type=int,
                        default=1,
                        help='The times asking request to suggestion service.')
    parser.add_argument('--metricsnames', type=strToList,
                        help='StudyJob metrics name list.')
    parser.add_argument('--parameterconfigs', type=yamlOrJsonStr,
                        default={},
                        help='StudyJob parameterconfigs.')
    parser.add_argument('--nasConfig', type=yamlOrJsonStr,
                        default={},
                        help='StudyJob nasConfig.')
    parser.add_argument('--workertemplatepath', type=str,
                        default="",
                        help='StudyJob worker spec.')
    parser.add_argument('--mcollectortemplatepath', type=str,
                        default="",
                        help='StudyJob metrics collector spec.')
    parser.add_argument('--suggestionspec', type=yamlOrJsonStr,
                        default={},
                        help='StudyJob suggestion spec.')
    parser.add_argument('--outputfile', type=str,
                        default='/output.txt',
                        help='The file which stores the best trial of the studyJob.')
    parser.add_argument('--studyjobtimeoutminutes', type=int,
                        default=10,
                        help='Time in minutes to wait for the StudyJob to complete')
    return parser


def main(argv=None):
    """Create a StudyJob, wait for it, and write the best trial's parameters.

    On completion the best trial's parameter set is written to
    ``--outputfile`` as a JSON object mapping parameter name to value. The
    StudyJob is deleted afterwards whether or not the study succeeded.

    Args:
        argv: Argument list to parse; None means ``sys.argv[1:]``.

    Raises:
        SystemExit: If the StudyJob ends in any condition other than
            "Completed", so the step is reported as failed instead of
            exiting successfully without an output file.
    """
    args = _build_arg_parser().parse_args(argv)

    logging.getLogger().setLevel(logging.INFO)

    logging.info('Generating studyjob template.')
    template_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'hp.template.yaml')
    content_yaml = _generate_studyjob_yaml(template_file, args.name, args.namespace, args.optimizationtype, args.objectivevaluename,
                                           args.optimizationgoal, args.requestcount, args.metricsnames, args.parameterconfigs,
                                           args.nasConfig, args.workertemplatepath, args.mcollectortemplatepath, args.suggestionspec)

    config.load_incluster_config()
    api_client = k8s_client.ApiClient()
    create_response = study_job_client.create_study_job(api_client, content_yaml)
    job_name = create_response['metadata']['name']
    job_namespace = create_response['metadata']['namespace']

    # From here on the StudyJob exists in the cluster; the finally clause
    # makes sure it is removed even if waiting or the trial lookup raises.
    try:
        expected_condition = ["Completed", "Failed"]
        wait_response = study_job_client.wait_for_condition(
            api_client, job_namespace, job_name, expected_condition,
            timeout=datetime.timedelta(minutes=args.studyjobtimeoutminutes))

        condition = wait_response.get("status", {}).get("condition")
        if condition != "Completed":
            raise SystemExit('StudyJob %s finished with condition %s.' % (job_name, condition))

        trial = get_best_trial(wait_response["status"]["bestTrialId"])
        best_parameters = {ps.name: ps.value for ps in trial.parameter_set}
        with open(args.outputfile, 'w') as f:
            f.write(json.dumps(best_parameters))
        logging.info('Study success.')
    finally:
        study_job_client.delete_study_job(api_client, job_name, job_namespace)


if __name__ == "__main__":
    main()