forked from kubeflow/pipelines
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add katib studyjob launcher (kubeflow#754)
* add katib studyjob launcher * delete tmp file * fix link error to tf-launcher * import studyjob client from katib project * specify output file with a parameter undo tf-launcher
- Loading branch information
1 parent
6ed804b
commit 7737025
Showing
6 changed files
with
355 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Copyright 2019 The Kubeflow Authors | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
FROM ubuntu:16.04

# Install the Python 2 toolchain and the launcher's Python dependencies, then
# fetch the Katib sources that provide the gRPC stubs and the StudyJob client.
# The downloaded archive, the apt package lists and the pip cache are removed
# in the same layer so they do not end up in the final image.
RUN apt-get update -y && \
    apt-get install --no-install-recommends -y -q ca-certificates python-dev python-setuptools wget unzip git && \
    easy_install pip && \
    pip install --no-cache-dir pyyaml==3.12 six==1.11.0 requests==2.18.4 grpcio gcloud google-api-python-client protobuf kubernetes && \
    wget https://github.com/kubeflow/katib/archive/master.zip && unzip master.zip && \
    rm -f master.zip && \
    rm -rf /var/lib/apt/lists/*

# Make the unpacked Katib Python packages importable by the launcher.
ENV PYTHONPATH="${PYTHONPATH}:/katib-master/pkg/api/python:/katib-master/py"

# COPY is sufficient for a plain local directory (no archive extraction or
# remote URL handling is needed here).
COPY build /ml

RUN mkdir /usr/licenses && \
    /ml/license.sh /ml/third_party_licenses.csv /usr/licenses

ENTRYPOINT ["python", "/ml/launch_study_job.py"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#!/bin/bash -e | ||
# Copyright 2019 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# Fail fast even when the script is started as `bash <script>`, in which case
# the `-e` flag on the shebang line is not applied.
set -e

# Print the one-line usage summary.
usage() {
  echo "Usage: cmd [-p] project [-t] tag [-i] image"
}

while getopts ":hp:t:i:" opt; do
  case "${opt}" in
    h) echo "-p: project name"
       echo "-t: tag name"
       echo "-i: image name. If provided, project name and tag name are not necessary"
       exit
      ;;
    p) PROJECT_ID=${OPTARG}
      ;;
    t) TAG_NAME=${OPTARG}
      ;;
    i) LAUNCHER_IMAGE_NAME=${OPTARG}
      ;;
    # With a leading ':' in the optstring, getopts reports a missing option
    # argument as ':' and stores the option letter in OPTARG.
    :) echo "Option -${OPTARG} requires an argument." >&2
       usage >&2
       exit 1
      ;;
    # Unknown option: report on stderr and exit non-zero so callers notice.
    \? ) usage >&2
         exit 1
      ;;
  esac
done

# Stage the sources and license tooling into ./build, which is the directory
# the Dockerfile copies into the image.
mkdir -p ./build
# Remove the staging directory on every exit path, including a failed
# docker build or push.
trap 'rm -rf ./build' EXIT
rsync -arvp ./src/ ./build/

cp ../../license.sh ./build
cp ../../third_party_licenses.csv ./build

LOCAL_LAUNCHER_IMAGE_NAME=ml-pipeline-kubeflow-studyjob

docker build -t "${LOCAL_LAUNCHER_IMAGE_NAME}" .
if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then
  # No explicit image name: push to gcr.io/<project>/<local name>:<tag>,
  # deriving the tag and the project when they were not given.
  if [ -z "${TAG_NAME}" ]; then
    TAG_NAME=$(date +v%Y%m%d)-$(git describe --tags --always --dirty)-$(git diff | shasum -a256 | cut -c -6)
  fi
  if [ -z "${PROJECT_ID}" ]; then
    PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)")
  fi
  REMOTE_IMAGE_NAME="gcr.io/${PROJECT_ID}/${LOCAL_LAUNCHER_IMAGE_NAME}:${TAG_NAME}"
  docker tag "${LOCAL_LAUNCHER_IMAGE_NAME}" "${REMOTE_IMAGE_NAME}"
  docker push "${REMOTE_IMAGE_NAME}"
else
  docker tag "${LOCAL_LAUNCHER_IMAGE_NAME}" "${LAUNCHER_IMAGE_NAME}"
  docker push "${LAUNCHER_IMAGE_NAME}"
fi
40 changes: 40 additions & 0 deletions
40
components/kubeflow/katib-launcher/kubeflow_katib_launcher_op.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# Copyright 2019 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import kfp.dsl as dsl | ||
|
||
def kubeflow_studyjob_launcher_op(name, namespace, optimizationtype, objectivevaluename, optimizationgoal,
                                  requestcount, metricsnames, parameterconfigs, nasConfig,
                                  workertemplatepath, mcollectortemplatepath, suggestionspec,
                                  studyjob_timeout_minutes, output_file='/output.txt',
                                  step_name='StudyJob-Launcher',
                                  image='liuhougangxa/ml-pipeline-kubeflow-studyjob:latest'):
    """Build the pipeline step that runs the StudyJob launcher container.

    Every StudyJob setting is forwarded unchanged to the container as the
    command-line flag of the same name.

    Args:
        name: value for ``--name``.
        namespace: value for ``--namespace``.
        optimizationtype: value for ``--optimizationtype``.
        objectivevaluename: value for ``--objectivevaluename``.
        optimizationgoal: value for ``--optimizationgoal``.
        requestcount: value for ``--requestcount``.
        metricsnames: value for ``--metricsnames``.
        parameterconfigs: value for ``--parameterconfigs``.
        nasConfig: value for ``--nasConfig``.
        workertemplatepath: value for ``--workertemplatepath``.
        mcollectortemplatepath: value for ``--mcollectortemplatepath``.
        suggestionspec: value for ``--suggestionspec``.
        studyjob_timeout_minutes: value for ``--studyjobtimeoutminutes``.
        output_file: path passed as ``--outputfile``; the same path is
            exposed as the step's ``hyperparameter`` file output.
        step_name: name given to the ``ContainerOp``.
        image: container image to run. Defaults to the previously
            hard-coded image so existing callers are unaffected.

    Returns:
        The ``dsl.ContainerOp`` describing the launcher step.
    """
    return dsl.ContainerOp(
        name=step_name,
        image=image,
        arguments=[
            '--name', name,
            '--namespace', namespace,
            '--optimizationtype', optimizationtype,
            '--objectivevaluename', objectivevaluename,
            '--optimizationgoal', optimizationgoal,
            '--requestcount', requestcount,
            '--metricsnames', metricsnames,
            '--parameterconfigs', parameterconfigs,
            '--nasConfig', nasConfig,
            '--workertemplatepath', workertemplatepath,
            '--mcollectortemplatepath', mcollectortemplatepath,
            '--suggestionspec', suggestionspec,
            '--outputfile', output_file,
            '--studyjobtimeoutminutes', studyjob_timeout_minutes,
        ],
        file_outputs={'hyperparameter': output_file}
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Copyright 2019 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from .kubeflow_katib_launcher_op import kubeflow_studyjob_launcher_op |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# StudyJob template consumed by launch_study_job.py. The launcher overwrites
# metadata.name/namespace and the scalar spec fields, replaces or removes
# parameterconfigs / nasConfig / metricsnames / suggestionSpec, and fills in
# (or removes) the two goTemplate.templatePath entries.
apiVersion: "kubeflow.org/v1alpha1"
kind: StudyJob
metadata:
  namespace: kubeflow
  labels:
    controller-tools.k8s.io: "1.0"
  name: study-example
spec:
  studyName: study-example
  owner: crd
  optimizationtype: ""
  objectivevaluename: ""
  optimizationgoal: 0.99
  requestcount: 4
  metricsnames:
    - accuracy_1
  # NOTE(review): the nesting inside nasConfig is reconstructed from the key
  # order; confirm against the Katib v1alpha1 StudyJob schema.
  nasConfig:
    graphConfig:
      numLayers: 8
      inputSize:
        - 32
        - 32
        - 3
      outputSize:
        - 10
    operations:
      - operationType: convolution
        parameterconfigs:
          - name: filter_size
            parametertype: categorical
            feasible:
              list:
                - "3"
                - "5"
                - "7"
  parameterconfigs:
    - name: --learning_rate
      parametertype: double
      feasible:
        min: "0.01"
        max: "0.05"
  workerSpec:
    goTemplate:
      templatePath: ""
  metricsCollectorSpec:
    goTemplate:
      templatePath: ""
  suggestionSpec:
    suggestionAlgorithm: "random"
164 changes: 164 additions & 0 deletions
164
components/kubeflow/katib-launcher/src/launch_study_job.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
# Copyright 2018 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
import argparse | ||
import datetime | ||
import json | ||
import os | ||
import logging | ||
import requests | ||
import subprocess | ||
import yaml | ||
import grpc | ||
|
||
import api_pb2 | ||
import api_pb2_grpc | ||
|
||
from kubernetes import client as k8s_client | ||
from kubernetes import config | ||
import study_job_client | ||
|
||
def yamlOrJsonStr(str):
    """Parse a command-line string as JSON, falling back to YAML.

    Args:
        str: the raw text to parse. The name shadows the builtin but is kept
            so the signature stays unchanged for existing callers.

    Returns:
        None when the input is None or empty; otherwise the parsed object.
    """
    if str is None or str == "":
        return None
    try:
        return json.loads(str)
    except ValueError:
        # Not valid JSON (json raises ValueError or a subclass of it), so
        # treat the text as YAML. safe_load replaces the original yaml.load:
        # this text comes from the command line and must not be allowed to
        # construct arbitrary Python objects.
        return yaml.safe_load(str)
|
||
def strToList(str):
    """Split a comma-separated string into a list of trimmed, non-empty items.

    Args:
        str: comma-separated text. The name shadows the builtin but is kept
            so the signature stays unchanged for existing callers.

    Returns:
        The items with surrounding whitespace removed. Empty items are
        dropped, so an empty or missing input yields ``[]`` instead of
        ``['']``.
    """
    if not str:
        return []
    stripped = (item.strip() for item in str.split(","))
    return [item for item in stripped if item]
|
||
def _update_or_pop(spec, name, value): | ||
if value: | ||
spec[name] = value | ||
else: | ||
spec.pop(name) | ||
|
||
def _generate_studyjob_yaml(src_filename, name, namespace, optimizationtype, objectivevaluename,
                            optimizationgoal, requestcount, metricsnames, parameterconfigs,
                            nasConfig, workertemplatepath, mcollectortemplatepath, suggestionspec):
    """Build a StudyJob definition from the YAML template at ``src_filename``.

    The template's metadata and scalar spec fields are overwritten with the
    given values. Optional sections are replaced when a value is supplied
    and removed from the result when it is falsy.

    Args:
        src_filename: path of the YAML template to load.
        name: used for both ``metadata.name`` and ``spec.studyName``.
        namespace: stored in ``metadata.namespace``.
        optimizationtype: stored in ``spec.optimizationtype``.
        objectivevaluename: stored in ``spec.objectivevaluename``.
        optimizationgoal: stored in ``spec.optimizationgoal``.
        requestcount: stored in ``spec.requestcount``.
        metricsnames: replaces ``spec.metricsnames`` or removes it.
        parameterconfigs: replaces ``spec.parameterconfigs`` or removes it.
        nasConfig: replaces ``spec.nasConfig`` or removes it.
        workertemplatepath: stored in
            ``spec.workerSpec.goTemplate.templatePath``; when falsy the whole
            ``workerSpec`` section is removed.
        mcollectortemplatepath: stored in
            ``spec.metricsCollectorSpec.goTemplate.templatePath``; when falsy
            the whole ``metricsCollectorSpec`` section is removed.
        suggestionspec: replaces ``spec.suggestionSpec`` or removes it.

    Returns:
        The populated StudyJob definition as a dictionary.
    """
    with open(src_filename, 'r') as f:
        # safe_load replaces the original yaml.load: the template is plain
        # data, and safe_load does not need an explicit Loader argument.
        content = yaml.safe_load(f)

    metadata = content['metadata']
    spec = content['spec']

    metadata['name'] = name
    metadata['namespace'] = namespace
    spec['studyName'] = name
    spec['optimizationtype'] = optimizationtype
    spec['objectivevaluename'] = objectivevaluename
    spec['optimizationgoal'] = optimizationgoal
    spec['requestcount'] = requestcount

    _update_or_pop(spec, 'parameterconfigs', parameterconfigs)
    _update_or_pop(spec, 'nasConfig', nasConfig)
    _update_or_pop(spec, 'metricsnames', metricsnames)
    _update_or_pop(spec, 'suggestionSpec', suggestionspec)

    template_sections = (
        ('workerSpec', workertemplatepath),
        ('metricsCollectorSpec', mcollectortemplatepath),
    )
    for section, template_path in template_sections:
        if template_path:
            # setdefault tolerates a template that omits the section or its
            # goTemplate mapping.
            go_template = spec.setdefault(section, {}).setdefault('goTemplate', {})
            go_template['templatePath'] = template_path
        else:
            # Tolerate a template that never had the section.
            spec.pop(section, None)

    return content
|
||
def get_best_trial(trial_id, vizier_core="vizier-core.kubeflow:6789"):
    """Fetch a trial by id from the vizier-core gRPC manager service.

    Args:
        trial_id: id sent in the ``GetTrialRequest``.
        vizier_core: ``host:port`` of the manager service. Defaults to the
            previously hard-coded address, so existing callers are
            unaffected.

    Returns:
        The ``trial`` field of the ``GetTrial`` response.
    """
    # The channel is opened without transport security.
    with grpc.insecure_channel(vizier_core) as channel:
        stub = api_pb2_grpc.ManagerStub(channel)
        response = stub.GetTrial(api_pb2.GetTrialRequest(trial_id=trial_id))
        return response.trial
|
||
def main(argv=None):
    """Create a StudyJob, wait for it to finish and write out the best trial.

    The StudyJob is generated from ``hp.template.yaml`` located next to this
    script, created through the in-cluster Kubernetes configuration, and
    deleted again before returning. When it ends in the ``Completed``
    condition, the best trial's parameters are written to ``--outputfile``
    as a JSON object mapping parameter name to value.

    Args:
        argv: command-line arguments to parse. ``None`` makes argparse fall
            back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='Kubeflow StudyJob launcher')
    parser.add_argument('--name', type=str,
                        help='StudyJob name.')
    parser.add_argument('--namespace', type=str,
                        default='kubeflow',
                        help='StudyJob namespace.')
    parser.add_argument('--optimizationtype', type=str,
                        default='minimize',
                        help='Direction of optimization. minimize or maximize.')
    parser.add_argument('--objectivevaluename', type=str,
                        help='Objective value name which trainer optimizes.')
    parser.add_argument('--optimizationgoal', type=float,
                        help='Stop studying once objectivevaluename value ' +
                             'exceeds optimizationgoal')
    parser.add_argument('--requestcount', type=int,
                        default=1,
                        help='The times asking request to suggestion service.')
    parser.add_argument('--metricsnames', type=strToList,
                        help='StudyJob metrics name list.')
    parser.add_argument('--parameterconfigs', type=yamlOrJsonStr,
                        default={},
                        help='StudyJob parameterconfigs.')
    parser.add_argument('--nasConfig', type=yamlOrJsonStr,
                        default={},
                        help='StudyJob nasConfig.')
    parser.add_argument('--workertemplatepath', type=str,
                        default="",
                        help='StudyJob worker spec.')
    parser.add_argument('--mcollectortemplatepath', type=str,
                        default="",
                        help='StudyJob metrics collector spec.')
    parser.add_argument('--suggestionspec', type=yamlOrJsonStr,
                        default={},
                        help='StudyJob suggestion spec.')
    parser.add_argument('--outputfile', type=str,
                        default='/output.txt',
                        help='The file which stores the best trial of the studyJob.')
    parser.add_argument('--studyjobtimeoutminutes', type=int,
                        default=10,
                        help='Time in minutes to wait for the StudyJob to complete')

    # Honour the argv parameter; the original ignored it and always parsed
    # sys.argv.
    args = parser.parse_args(argv)

    logging.getLogger().setLevel(logging.INFO)

    logging.info('Generating studyjob template.')
    template_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'hp.template.yaml')
    content_yaml = _generate_studyjob_yaml(
        template_file, args.name, args.namespace, args.optimizationtype, args.objectivevaluename,
        args.optimizationgoal, args.requestcount, args.metricsnames, args.parameterconfigs,
        args.nasConfig, args.workertemplatepath, args.mcollectortemplatepath, args.suggestionspec)

    config.load_incluster_config()
    api_client = k8s_client.ApiClient()
    create_response = study_job_client.create_study_job(api_client, content_yaml)
    job_name = create_response['metadata']['name']
    job_namespace = create_response['metadata']['namespace']

    try:
        expected_condition = ["Completed", "Failed"]
        wait_response = study_job_client.wait_for_condition(
            api_client, job_namespace, job_name, expected_condition,
            timeout=datetime.timedelta(minutes=args.studyjobtimeoutminutes))
        condition = wait_response.get("status", {}).get("condition")
        if condition == "Completed":
            trial = get_best_trial(wait_response["status"]["bestTrialId"])
            ps_dict = {ps.name: ps.value for ps in trial.parameter_set}
            with open(args.outputfile, 'w') as f:
                f.write(json.dumps(ps_dict))
            logging.info('Study success.')
        else:
            # Make a non-successful outcome visible in the logs rather than
            # returning silently without an output file.
            logging.error('StudyJob %s ended with condition %s; no output file written.',
                          job_name, condition)
    finally:
        # Always remove the StudyJob, including when waiting or fetching the
        # best trial raises, so it is not left behind in the cluster.
        study_job_client.delete_study_job(api_client, job_name, job_namespace)


if __name__ == "__main__":
    main()