From 495f650cd7552399eb09b08b522fb045ddf78153 Mon Sep 17 00:00:00 2001
From: SparkSnail
Date: Fri, 30 Nov 2018 15:51:33 +0800
Subject: [PATCH] Add AKS document (#422)

1. Add kubeflow in experiment config document
2. Add AKS in kubeflow document
---
 docs/ExperimentConfig.md                  | 369 ++++++++++++++++--
 docs/KubeflowMode.md                      |  20 +
 .../mnist-annotation/config_kubeflow.yml  |  29 ++
 .../mnist-smartparam/config_kubeflow.yml  |  29 ++
 examples/trials/mnist/config_kubeflow.yml |  30 ++
 5 files changed, 434 insertions(+), 43 deletions(-)
 create mode 100644 examples/trials/mnist-annotation/config_kubeflow.yml
 create mode 100644 examples/trials/mnist-smartparam/config_kubeflow.yml
 create mode 100644 examples/trials/mnist/config_kubeflow.yml

diff --git a/docs/ExperimentConfig.md b/docs/ExperimentConfig.md
index 21f086c0f5..53cd171465 100644
--- a/docs/ExperimentConfig.md
+++ b/docs/ExperimentConfig.md
@@ -1,9 +1,8 @@
 # Experiment config reference
-===
-If you want to create a new nni experiment, you need to prepare a config file in your local machine, and provide the path of this file to nnictl.
+A config file is needed when creating an experiment, and the path of this config file is provided to nnictl.
 The config file is written in yaml format, and need to be written correctly.
-This document describes the rule to write config file, and will provide some examples and templates for you.
+This document describes the rules for writing a config file, and provides some examples and templates.
 ## Template
 * __light weight(without Annotation and Assessor)__
 ```
@@ -12,7 +11,7 @@ experimentName:
 trialConcurrency:
 maxExecDuration:
 maxTrialNum:
-#choice: local, remote, pai
+#choice: local, remote, pai, kubeflow
 trainingServicePlatform:
 searchSpacePath:
 #choice: true, false
@@ -42,7 +41,7 @@ experimentName:
 trialConcurrency:
 maxExecDuration:
 maxTrialNum:
-#choice: local, remote, pai
+#choice: local, remote, pai, kubeflow
 trainingServicePlatform:
 searchSpacePath:
 #choice: true, false
@@ -79,7 +78,7 @@ experimentName:
 trialConcurrency:
 maxExecDuration:
 maxTrialNum:
-#choice: local, remote, pai
+#choice: local, remote, pai, kubeflow
 trainingServicePlatform:
 #choice: true, false
 useAnnotation:
@@ -118,7 +117,7 @@ machineList:
 * __experimentName__
   * Description

-    __experimentName__ is the name of the experiment you created.
+    __experimentName__ is the name of the created experiment.
     TBD: add default value

 * __trialConcurrency__
@@ -126,7 +125,7 @@ machineList:

     __trialConcurrency__ specifies the max num of trial jobs run simultaneously.

-    Note: if you set trialGpuNum bigger than the free gpu numbers in your machine, and the trial jobs running simultaneously can not reach trialConcurrency number, some trial jobs will be put into a queue to wait for gpu allocation.
+    Note: if trialGpuNum is bigger than the number of free gpus, and the trial jobs running simultaneously therefore cannot reach the trialConcurrency number, some trial jobs will be put into a queue to wait for gpu allocation.

 * __maxExecDuration__
   * Description
@@ -141,37 +140,50 @@ machineList:
 * __trainingServicePlatform__
   * Description

-    __trainingServicePlatform__ specifies the platform to run the experiment, including {__local__, __remote__}.
-    * __local__ mode means you run an experiment in your local linux machine.
+    __trainingServicePlatform__ specifies the platform to run the experiment, including {__local__, __remote__, __pai__, __kubeflow__}.

-    * __remote__ mode means you submit trial jobs to remote linux machines. If you set platform as remote, you should complete __machineList__ field.
+    * __local__ runs an experiment on the local ubuntu machine.
+
+
+    * __remote__ submits trial jobs to remote ubuntu machines, and the __machineList__ field should be filled in order to set up the SSH connection to the remote machines.

-    * __pai__ mode means you submit trial jobs to [OpenPai](https://github.com/Microsoft/pai) of Microsoft. For more details of pai configuration, please reference [PAIMOdeDoc](./PAIMode.md)
+
+    * __pai__ submits trial jobs to [OpenPai](https://github.com/Microsoft/pai) of Microsoft. For more details of pai configuration, please refer to [PAIModeDoc](./PAIMode.md).
+
+    * __kubeflow__ submits trial jobs to [kubeflow](https://www.kubeflow.org/docs/about/kubeflow/). NNI supports kubeflow based on normal kubernetes and [azure kubernetes](https://azure.microsoft.com/en-us/services/kubernetes-service/).

 * __searchSpacePath__
   * Description

-    __searchSpacePath__ specifies the path of search space file you want to use, which should be a valid path in your local linux machine.
+    __searchSpacePath__ specifies the path of the search space file, which should be a valid path on the local linux machine.

-    Note: if you set useAnnotation=True, you should remove searchSpacePath field or just let it be empty.
+    Note: if useAnnotation is set to True, the searchSpacePath field should be removed.

 * __useAnnotation__
   * Description

-    __useAnnotation__ means whether you use annotation to analysis your code and generate search space.
+    __useAnnotation__ specifies whether to use annotation to analyze the trial code and generate the search space.
+
+    Note: if useAnnotation is set to True, the searchSpacePath field should be removed.
+
+* __nniManagerIp__
+  * Description
+
+    __nniManagerIp__ sets the IP address of the machine on which the nni manager process runs. This field is optional, and if it is not set, the eth0 device IP will be used instead.
+
+    Note: run ifconfig on the NNI manager's machine to check whether the eth0 device exists. If it does not, we recommend setting nniManagerIp explicitly.

-    Note: if you set useAnnotation=True, you should not set searchSpacePath.
 * __tuner__
   * Description

-    __tuner__ specifies the tuner algorithm you use to run an experiment, there are two kinds of ways to set tuner. One way is to use tuner provided by nni sdk, you just need to set __builtinTunerName__ and __classArgs__. Another way is to use your own tuner file, and you need to set __codeDirectory__, __classFileName__, __className__ and __classArgs__.
+    __tuner__ specifies the tuner algorithm of the experiment. There are two ways to set the tuner. One way is to use a tuner provided by the nni sdk, which only requires __builtinTunerName__ and __classArgs__. The other way is to use a user-defined tuner file, which requires __codeDir__, __classFileName__, __className__ and __classArgs__.
   * __builtinTunerName__ and __classArgs__
     * __builtinTunerName__

-      __builtinTunerName__ specifies the name of system tuner you want to use, nni sdk provides four kinds of tuner, including {__TPE__, __Random__, __Anneal__, __Evolution__, __BatchTuner__, __GridSearch__}
+      __builtinTunerName__ specifies the name of a builtin tuner; the nni sdk provides several tuners, including {__TPE__, __Random__, __Anneal__, __Evolution__, __BatchTuner__, __GridSearch__}
     * __classArgs__

-      __classArgs__ specifies the arguments of tuner algorithm. If the __builtinTunerName__ is in {__TPE__, __Random__, __Anneal__, __Evolution__}, you should set __optimize_mode__.
+      __classArgs__ specifies the arguments of the tuner algorithm. If the __builtinTunerName__ is in {__TPE__, __Random__, __Anneal__, __Evolution__}, the user should set __optimize_mode__.
   * __codeDir__, __classFileName__, __className__ and __classArgs__
     * __codeDir__

@@ -187,19 +199,19 @@ machineList:
       __classArgs__ specifies the arguments of tuner algorithm.
     * __gpuNum__

-      __gpuNum__ specifies the gpu number you want to use to run the tuner process. The value of this field should be a positive number.
+      __gpuNum__ specifies the number of gpus used to run the tuner process. The value of this field should be a positive number.

-    Note: you could only specify one way to set tuner, for example, you could set {tunerName, optimizationMode} or {tunerCommand, tunerCwd}, and you could not set them both.
+    Note: users can only specify one way to set the tuner, for example, set {tunerName, optimizationMode} or {tunerCommand, tunerCwd}, but not both.

 * __assessor__
   * Description

-    __assessor__ specifies the assessor algorithm you use to run an experiment, there are two kinds of ways to set assessor. One way is to use assessor provided by nni sdk, you just need to set __builtinAssessorName__ and __classArgs__. Another way is to use your own tuner file, and you need to set __codeDirectory__, __classFileName__, __className__ and __classArgs__.
+    __assessor__ specifies the assessor algorithm of the experiment. There are two ways to set the assessor. One way is to use an assessor provided by the nni sdk, which only requires __builtinAssessorName__ and __classArgs__. The other way is to use a user-defined assessor file, which requires __codeDir__, __classFileName__, __className__ and __classArgs__.
   * __builtinAssessorName__ and __classArgs__
     * __builtinAssessorName__

-      __builtinAssessorName__ specifies the name of system assessor you want to use, nni sdk provides four kinds of tuner, including {__TPE__, __Random__, __Anneal__, __Evolution__}
+      __builtinAssessorName__ specifies the name of a builtin assessor provided by the nni sdk, for example __Medianstop__ (used in the examples below).
     * __classArgs__

       __classArgs__ specifies the arguments of tuner algorithm
   * __codeDir__, __classFileName__, __className__ and __classArgs__
     * __codeDir__
@@ -218,10 +230,10 @@ machineList:
       __classArgs__ specifies the arguments of tuner algorithm.
     * __gpuNum__

-      __gpuNum__ specifies the gpu number you want to use to run the assessor process. The value of this field should be a positive number.
+      __gpuNum__ specifies the number of gpus used to run the assessor process. The value of this field should be a positive number.

-    Note: you could only specify one way to set assessor, for example, you could set {assessorName, optimizationMode} or {assessorCommand, assessorCwd}, and you could not set them both.If you do not want to use assessor, you just need to leave assessor empty or remove assessor in your config file. Default value is 0.
-* __trial__
+    Note: users can only specify one way to set the assessor, for example, set {assessorName, optimizationMode} or {assessorCommand, assessorCwd}, but not both. If users do not want to use an assessor, the assessor field should be left empty.
+* __trial(local, remote)__
   * __command__

     __command__ specifies the command to run trial process.
   * __codeDir__

     __codeDir__ specifies the directory of your own trial file.
   * __gpuNum__

-    __gpuNum__ specifies the num of gpu you want to use to run your trial process. Default value is 0.
+    __gpuNum__ specifies the number of gpus used to run the trial process. Default value is 0.
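+
+    A minimal sketch of the trial section for local/remote mode (the command and path below are placeholders):
+    ```
+    trial:
+      command: python3 mnist.py
+      codeDir: /nni/mnist
+      gpuNum: 0
+    ```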
+
+* __trial(pai)__
+  * __command__
+
+    __command__ specifies the command to run trial process.
+  * __codeDir__
+
+    __codeDir__ specifies the directory of the user's own trial files.
+  * __gpuNum__
+
+    __gpuNum__ specifies the number of gpus used to run the trial process. Default value is 0.
+  * __cpuNum__
+
+    __cpuNum__ is the number of cpus to be used in the pai container.
+  * __memoryMB__
+
+    __memoryMB__ sets the memory size (in MB) to be used in the pai container.
+
+  * __image__
+
+    __image__ sets the docker image to be used in pai.
+
+  * __dataDir__
+
+    __dataDir__ is the data directory in hdfs to be used.
+
+  * __outputDir__
+
+    __outputDir__ is the output directory in hdfs to be used in pai; the stdout and stderr files are stored in this directory after the job finishes.
+
+
+
+* __trial(kubeflow)__
+
+  * __codeDir__
+
+    __codeDir__ is the local directory where the code files are located.
+
+  * __ps(optional)__
+
+    __ps__ is the configuration of the ps (parameter server) role for kubeflow's tensorflow-operator.
+    * __replicas__
+
+      __replicas__ is the number of replicas of the __ps__ role.
+
+    * __command__
+
+      __command__ is the command to run in the __ps__ container.
+
+    * __gpuNum__
+
+      __gpuNum__ sets the number of gpus to be used in the __ps__ container.
+
+    * __cpuNum__
+
+      __cpuNum__ sets the number of cpus to be used in the __ps__ container.
+
+    * __memoryMB__
+
+      __memoryMB__ sets the memory size (in MB) of the container.
+
+    * __image__
+
+      __image__ sets the docker image to be used in __ps__.
+
+  * __worker__
+
+    __worker__ is the configuration of the worker role for kubeflow's tensorflow-operator.
+    * __replicas__
+
+      __replicas__ is the number of replicas of the __worker__ role.
+
+    * __command__
+
+      __command__ is the command to run in the __worker__ container.
+
+    * __gpuNum__
+
+      __gpuNum__ sets the number of gpus to be used in the __worker__ container.
+
+    * __cpuNum__
+
+      __cpuNum__ sets the number of cpus to be used in the __worker__ container.
+
+    * __memoryMB__
+
+      __memoryMB__ sets the memory size (in MB) of the container.
+
+    * __image__
+
+      __image__ sets the docker image to be used in __worker__.
+
+
 * __machineList__

-  __machineList__ should be set if you set __trainingServicePlatform__=remote, or it could be empty.
+  __machineList__ should be set if users set __trainingServicePlatform__=remote; otherwise it can be left empty.
   * __ip__
+
-    __ip__ is the ip address of your remote machine.
+    __ip__ is the ip address of the remote machine.
+
   * __port__
+
-    __port__ is the ssh port you want to use to connect machine.
+    __port__ is the ssh port used to connect to the remote machine.

-    Note: if you set port empty, the default value will be 22.
+    Note: if users leave the port empty, the default value will be 22.
   * __username__

-    __username__ is the account you use.
+    __username__ is the account of the remote machine.
   * __passwd__

-    __passwd__ specifies the password of your account.
+    __passwd__ specifies the password of the account.
   * __sshKeyPath__

-    If you want to use ssh key to login remote machine, you could set __sshKeyPath__ in config file. __sshKeyPath__ is the path of ssh key file, which should be valid.
+    If users use an ssh key to log in to the remote machine, __sshKeyPath__ could be set in the config file. __sshKeyPath__ is the path of the ssh key file, which should be valid.

-    Note: if you set passwd and sshKeyPath simultaneously, nni will try passwd.
+    Note: if users set passwd and sshKeyPath simultaneously, nni will try passwd.
   * __passphrase__

-    __passphrase__ is used to protect ssh key, which could be empty if you don't have passphrase.
+    __passphrase__ is used to protect the ssh key, and could be empty if users don't have a passphrase.
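+
+  A sketch of a machineList with one machine using a password and one using an ssh key (addresses and credentials below are placeholders):
+  ```
+  machineList:
+    - ip: 10.10.10.10
+      port: 22
+      username: test
+      passwd: test
+    - ip: 10.10.10.11
+      port: 22
+      username: test
+      sshKeyPath: /nni/sshkey
+      passphrase: qwert
+  ```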
+
+* __kubeflowConfig__:
+
+  * __operator__
+
+    __operator__ specifies the kubeflow operator to be used; nni supports __tf-operator__ in the current version.
+
+  * __nfs__
+
+    __server__ is the host of the nfs server.
+
+    __path__ is the exported path of the nfs server, which will be mounted.
+
+  * __kubernetesServer__
+
+    __kubernetesServer__ sets the host of the kubernetes API server.
+
+  * __keyVault__
+
+    If users want to use azure kubernetes service, they should set keyVault to store the private key of the azure storage account. Refer to https://docs.microsoft.com/en-us/azure/key-vault/key-vault-manage-with-cli2
+
+    * __vaultName__
+
+      __vaultName__ is the value of ```--vault-name``` used in the az command.
+
+    * __name__
+      __name__ is the value of ```--name``` used in the az command.
+
+  * __azureStorage__
+
+    If users use azure kubernetes service, they should set an azure storage account to store code files.
+
+    * __accountName__
+
+      __accountName__ is the name of the azure storage account.
+
+    * __azureShare__
+
+      __azureShare__ is the file share name of the azure file storage.
+
+* __paiConfig__
+
+  * __userName__
+
+    __userName__ is the user name of the pai account.
+
+  * __password__
+
+    __password__ is the password of the pai account.
+
+  * __host__
+
+    __host__ is the host of pai.
+
+
 ## Examples
 * __local mode__

-  If you want to run your trial jobs in your local machine, and use annotation to generate search space, you could use the following config:
+  If users want to run trial jobs on the local machine and use annotation to generate the search space, they could use the following config:
 ```
 authorName: test
 experimentName: test_experiment
 trialConcurrency: 3
 maxExecDuration: 1h
 maxTrialNum: 10
-#choice: local, remote, pai
+#choice: local, remote, pai, kubeflow
 trainingServicePlatform: local
 #choice: true, false
 useAnnotation: true
 tuner:
@@ -287,14 +449,14 @@ trial:
   gpuNum: 0
 ```

-  If you want to use assessor, you could add assessor configuration in your file.
+  Assessor configuration can be added in the config file if an assessor is used:
 ```
 authorName: test
 experimentName: test_experiment
 trialConcurrency: 3
 maxExecDuration: 1h
 maxTrialNum: 10
-#choice: local, remote, pai
+#choice: local, remote, pai, kubeflow
 trainingServicePlatform: local
 searchSpacePath: /nni/search_space.json
 #choice: true, false
@@ -326,7 +488,7 @@ experimentName: test_experiment
 trialConcurrency: 3
 maxExecDuration: 1h
 maxTrialNum: 10
-#choice: local, remote, pai
+#choice: local, remote, pai, kubeflow
 trainingServicePlatform: local
 searchSpacePath: /nni/search_space.json
 #choice: true, false
@@ -355,14 +517,14 @@ trial:

 * __remote mode__

-If you want run trial jobs in your remote machine, you could specify the remote mahcine information as fllowing format:
+If users want to run trial jobs on a remote machine, they could specify the remote machine information in the following format:
 ```
 authorName: test
 experimentName: test_experiment
 trialConcurrency: 3
 maxExecDuration: 1h
 maxTrialNum: 10
-#choice: local, remote, pai
+#choice: local, remote, pai, kubeflow
 trainingServicePlatform: remote
 searchSpacePath: /nni/search_space.json
 #choice: true, false
@@ -394,3 +556,124 @@ machineList:
     sshKeyPath: /nni/sshkey
     passphrase: qwert
 ```
+
+* __pai mode__
+
+```
+authorName: test
+experimentName: nni_test1
+trialConcurrency: 1
+maxExecDuration: 500h
+maxTrialNum: 1
+#choice: local, remote, pai, kubeflow
+trainingServicePlatform: pai
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  command: python3 main.py
+  codeDir: .
+  gpuNum: 4
+  cpuNum: 2
+  memoryMB: 10000
+  #The docker image to run nni job on pai
+  image: msranni/nni:latest
+  #The hdfs directory to store data on pai, format 'hdfs://host:port/directory'
+  dataDir: hdfs://10.11.12.13:9000/test
+  #The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory'
+  outputDir: hdfs://10.11.12.13:9000/test
+paiConfig:
+  #The username to login pai
+  userName: test
+  #The password to login pai
+  passWord: test
+  #The host of restful server of pai
+  host: 10.10.10.10
+
+```
+
+* __kubeflow mode__
+
+kubeflow mode with nfs storage:
+
+```
+authorName: default
+experimentName: example_mni
+trialConcurrency: 1
+maxExecDuration: 1h
+maxTrialNum: 1
+#choice: local, remote, pai, kubeflow
+trainingServicePlatform: kubeflow
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  codeDir: .
+  worker:
+    replicas: 1
+    command: python3 mnist.py
+    gpuNum: 0
+    cpuNum: 1
+    memoryMB: 8192
+    image: msranni/nni:latest
+kubeflowConfig:
+  operator: tf-operator
+  nfs:
+    server: 10.10.10.10
+    path: /var/nfs/general
+```
+kubeflow mode with azure storage:
+```
+authorName: default
+experimentName: example_mni
+trialConcurrency: 1
+maxExecDuration: 1h
+maxTrialNum: 1
+#choice: local, remote, pai, kubeflow
+trainingServicePlatform: kubeflow
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+#nniManagerIp: 10.10.10.10
+tuner:
+  #choice: TPE, Random, Anneal, Evolution
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+assessor:
+  builtinAssessorName: Medianstop
+  classArgs:
+    optimize_mode: maximize
+  gpuNum: 0
+trial:
+  codeDir: .
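+  # an optional ps section (with the same fields as worker) could be added here for distributed training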
+  worker:
+    replicas: 1
+    command: python3 mnist.py
+    gpuNum: 0
+    cpuNum: 1
+    memoryMB: 4096
+    image: msranni/nni:latest
+kubeflowConfig:
+  operator: tf-operator
+  keyVault:
+    vaultName: Contoso-Vault
+    name: AzureStorageAccountKey
+  azureStorage:
+    accountName: storage
+    azureShare: share01
+```
diff --git a/docs/KubeflowMode.md b/docs/KubeflowMode.md
index b3c10fb5fa..b392a9b7d7 100644
--- a/docs/KubeflowMode.md
+++ b/docs/KubeflowMode.md
@@ -10,6 +10,14 @@ Now NNI supports running experiment on [Kubeflow](https://github.com/kubeflow/ku
 5. An **NFS** server and export a general purpose mount (we recommend to map your NFS server path in root_squash option. Refer this [page](https://linux.die.net/man/5/exports) to learn what root_squash option is), or **Azure File Storage**.
 6. Install **NNI**, follow the install guide [here](GetStarted.md).

+## Prerequisite for Azure Kubernetes Service
+1. NNI supports kubeflow based on Azure Kubernetes Service; follow the [guideline](https://azure.microsoft.com/en-us/services/kubernetes-service/) to set up Azure Kubernetes Service.
+2. Deploy kubeflow on Azure Kubernetes Service.
+3. Install __kubectl__ and the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest). Connect the kubectl client to the Azure Kubernetes cluster, and use `az login` to set the azure account.
+4. Follow the [guideline](https://docs.microsoft.com/en-us/azure/storage/common/storage-quickstart-create-account?tabs=portal) to create an azure file storage account. If you use Azure Kubernetes Service, nni needs the Azure Storage Service to store code files and output files.
+5. Set up the Azure Key Vault Service, and add a secret to Key Vault
+to store the private key of the azure storage account.
+
 ## Design
 TODO

@@ -56,6 +64,18 @@ kubeflowConfig:
     path: {your_nfs_server_exported_path}
   kubernetesServer: {your_kubernetes_api_server_ip}
 ```
+If you use Azure Kubernetes Service, you should set `kubeflowConfig` in your config yaml file as follows:
+```
+kubeflowConfig:
+  operator: tf-operator
+  keyVault:
+    vaultName: {your_vault_name}
+    name: {your_secret_name}
+  azureStorage:
+    accountName: {your_storage_account_name}
+    azureShare: {your_azure_share_name}
+```
+
 Note: You should explicitly set `trainingServicePlatform: kubeflow` in nni config yaml file if you want to start experiment in kubeflow mode.

 Trial configuration in kubeflow mode have the following configuration keys:
diff --git a/examples/trials/mnist-annotation/config_kubeflow.yml b/examples/trials/mnist-annotation/config_kubeflow.yml
new file mode 100644
index 0000000000..160c15bd1e
--- /dev/null
+++ b/examples/trials/mnist-annotation/config_kubeflow.yml
@@ -0,0 +1,29 @@
+authorName: default
+experimentName: example_dist
+trialConcurrency: 1
+maxExecDuration: 1h
+maxTrialNum: 1
+#choice: local, remote, pai, kubeflow
+trainingServicePlatform: kubeflow
+#choice: true, false
+useAnnotation: true
+tuner:
+  #choice: TPE, Random, Anneal, Evolution
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  codeDir: .
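+  # no searchSpacePath is set in this file because useAnnotation is true, so the search space is generated from the annotated code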
+ worker: + replicas: 1 + command: python3 mnist.py + gpuNum: 0 + cpuNum: 1 + memoryMB: 8192 + image: msranni/nni:latest +kubeflowConfig: + operator: tf-operator + nfs: + server: 10.10.10.10 + path: /var/nfs/general \ No newline at end of file diff --git a/examples/trials/mnist-smartparam/config_kubeflow.yml b/examples/trials/mnist-smartparam/config_kubeflow.yml new file mode 100644 index 0000000000..0bfab2cb5a --- /dev/null +++ b/examples/trials/mnist-smartparam/config_kubeflow.yml @@ -0,0 +1,29 @@ +authorName: default +experimentName: example_dist +trialConcurrency: 1 +maxExecDuration: 1h +maxTrialNum: 10 +#choice: local, remote, pai, kubeflow +trainingServicePlatform: kubeflow +#choice: true, false +useAnnotation: true +tuner: + #choice: TPE, Random, Anneal, Evolution + builtinTunerName: TPE + classArgs: + #choice: maximize, minimize + optimize_mode: maximize +trial: + codeDir: . + worker: + replicas: 1 + command: python3 mnist.py + gpuNum: 0 + cpuNum: 1 + memoryMB: 8192 + image: msranni/nni:latest +kubeflowConfig: + operator: tf-operator + nfs: + server: 10.10.10.10 + path: /var/nfs/general \ No newline at end of file diff --git a/examples/trials/mnist/config_kubeflow.yml b/examples/trials/mnist/config_kubeflow.yml new file mode 100644 index 0000000000..2729b90721 --- /dev/null +++ b/examples/trials/mnist/config_kubeflow.yml @@ -0,0 +1,30 @@ +authorName: default +experimentName: example_dist +trialConcurrency: 1 +maxExecDuration: 1h +maxTrialNum: 1 +#choice: local, remote, pai, kubeflow +trainingServicePlatform: kubeflow +searchSpacePath: search_space.json +#choice: true, false +useAnnotation: false +tuner: + #choice: TPE, Random, Anneal, Evolution + builtinTunerName: TPE + classArgs: + #choice: maximize, minimize + optimize_mode: maximize +trial: + codeDir: . + worker: + replicas: 1 + command: python3 mnist.py + gpuNum: 0 + cpuNum: 1 + memoryMB: 8192 + image: msranni/nni:latest +kubeflowConfig: + operator: tf-operator + nfs: + server: 10.10.10.10 + path: /var/nfs/general \ No newline at end of file
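
For reference, the kubeflow example configs above use only the worker role. A trial section that also sets the optional ps role described in ExperimentConfig.md could look like the following sketch (replica counts, resources, and image are placeholders):

```
trial:
  codeDir: .
  ps:
    replicas: 1
    command: python3 mnist.py
    gpuNum: 0
    cpuNum: 1
    memoryMB: 8192
    image: msranni/nni:latest
  worker:
    replicas: 2
    command: python3 mnist.py
    gpuNum: 0
    cpuNum: 1
    memoryMB: 8192
    image: msranni/nni:latest
```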