diff --git a/.travis.yml b/.travis.yml index 7cc4d8c0ba..bacd0768dc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,14 +4,14 @@ language: python python: - "3.6" before_install: - - wget https://nodejs.org/dist/v10.9.0/node-v10.9.0-linux-x64.tar.xz - - tar xf node-v10.9.0-linux-x64.tar.xz - - sudo mv node-v10.9.0-linux-x64 /usr/local/node + - wget https://nodejs.org/dist/v10.10.0/node-v10.10.0-linux-x64.tar.xz + - tar xf node-v10.10.0-linux-x64.tar.xz + - sudo mv node-v10.10.0-linux-x64 /usr/local/node - export PATH=/usr/local/node/bin:$PATH - sudo sh -c 'PATH=/usr/local/node/bin:$PATH yarn global add serve' install: - make - - make install + - make dev-install - export PATH=$HOME/.nni/bin:$PATH before_script: - cd test/naive diff --git a/Makefile b/Makefile index f429fc20bb..ad6baf71c2 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ else # is normal user endif ## Dependency information -NODE_VERSION ?= v10.9.0 +NODE_VERSION ?= v10.10.0 NODE_TARBALL ?= node-$(NODE_VERSION)-linux-x64.tar.xz NODE_PATH ?= $(INSTALL_PREFIX)/nni/node @@ -294,7 +294,7 @@ ifdef _ROOT $(error You should not develop NNI as root) endif ifdef _MISS_DEPS - $(error Please install Node.js, Yarn, and Serve to develop NNI) +# $(error Please install Node.js, Yarn, and Serve to develop NNI) endif #$(_INFO) Pass! $(_END) diff --git a/README.md b/README.md index 51678149a1..3bdf6f5d1b 100644 --- a/README.md +++ b/README.md @@ -26,31 +26,40 @@ The tool dispatches and runs trial jobs that generated by tuning algorithms to s * As a researcher and data scientist, you want to implement your own AutoML algorithms and compare with other algorithms * As a ML platform owner, you want to support AutoML in your platform -# Getting Started with NNI +# Get Started with NNI ## **Installation** -Install through python pip. 
(the current version only supports linux, nni on ubuntu 16.04 or newer has been well tested) -* requirements: python >= 3.5, git, wget +pip Installation Prerequisites +* linux (ubuntu 16.04 or newer version has been well tested) +* python >= 3.5 +* git, wget + ``` pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.1 source ~/.bashrc ``` +## **Quick start: run your first experiment at local** +It only requires 3 steps to start an experiment on NNI: +![](./docs/3_steps.jpg) + + +NNI provides a set of examples in the package to get you familiar with the above process. In the following example [/examples/trials/mnist], we had already set up the configuration and updated the training codes for you. You can directly run the following command to start an experiment. -## **Quick start: run an experiment at local** -Requirements: -* NNI installed on your local machine -* tensorflow installed +**NOTE**: The following example is an experiment built on TensorFlow, make sure you have **TensorFlow installed** before running the following command. -Run the following command to create an experiment for [mnist] +Try it out: ```bash - nnictl create --config ~/nni/examples/trials/mnist-annotation/config.yml + nnictl create --config ~/nni/examples/trials/mnist/config.yml ``` -This command will start an experiment and a WebUI. The WebUI endpoint will be shown in the output of this command (for example, `http://localhost:8080`). Open this URL in your browser. You can analyze your experiment through WebUI, or browse trials' tensorboard. + +In the command output, find out the **Web UI url** and open it in your browser. You can analyze your experiment through WebUI, or browse trials' tensorboard. 
+ +To learn more about how this example was constructed and how to analyze the experiement results in NNI Web UI, please refer to [How to write a trial run on NNI (MNIST as an example)?](docs/WriteYourTrial.md) ## **Please refer to [Get Started Tutorial](docs/GetStarted.md) for more detailed information.** ## More tutorials -* [How to write a trial running on NNI (Mnist as an example)?](docs/WriteYourTrial.md) + * [Tutorial of NNI python annotation.](tools/nni_annotation/README.md) * [Tuners supported by NNI.](src/sdk/pynni/nni/README.md) * [How to enable early stop (i.e. assessor) in an experiment?](docs/EnableAssessor.md) diff --git a/deployment/Dockerfile.build.base b/deployment/Dockerfile.build.base index 8fd7bf69aa..1380ccf3c9 100644 --- a/deployment/Dockerfile.build.base +++ b/deployment/Dockerfile.build.base @@ -40,7 +40,7 @@ RUN pip3 --no-cache-dir install \ numpy==1.14.3 scipy==1.1.0 # -#Install node 10.9.0, yarn 1.9.4, NNI v0.1 +#Install node 10.10.0, yarn 1.9.4, NNI v0.1 # RUN git clone -b v0.1 https://github.com/Microsoft/nni.git RUN cd nni && sh install.sh diff --git a/docs/3_steps.jpg b/docs/3_steps.jpg new file mode 100644 index 0000000000..e5e18540ea Binary files /dev/null and b/docs/3_steps.jpg differ diff --git a/docs/GetStarted.md b/docs/GetStarted.md index 5239c1aa5e..9f1affe592 100644 --- a/docs/GetStarted.md +++ b/docs/GetStarted.md @@ -1,14 +1,16 @@ -**Getting Started with NNI** +**Get Started with NNI** === ## **Installation** * __Dependencies__ python >= 3.5 + git + wget python pip should also be correctly installed. You could use "which pip" or "pip -V" to check in Linux. - * Note: For now, we don't support virtual environment. + * Note: we don't support virtual environment in current releases. 
* __Install NNI through pip__ diff --git a/docs/HowToContribute.md b/docs/HowToContribute.md new file mode 100644 index 0000000000..87a570e182 --- /dev/null +++ b/docs/HowToContribute.md @@ -0,0 +1,53 @@ +**How to contribute** +=== +## Best practice for debug NNI source code + +For debugging NNI source code, your development environment should be under Ubuntu 16.04 (or above) system with python 3 and pip 3 installed, then follow the below steps. + +**1. Clone the source code** + +Run the command +``` +git clone https://github.com/Microsoft/nni.git +``` +to clone the source code + +**2. Prepare the debug environment and install dependencies** + +Change directory to the source code folder, then run the command +``` +make install-dependencies +``` +to install the dependent tools for the environment + +**3. Build source code** + +Run the command +``` +make build +``` +to build the source code + +**4. Install NNI to development environment** + +Run the command +``` +make dev-install +``` +to install the distribution content to development environment, and create cli scripts + +**5. Check if the environment is ready** + +Now, you can try to start an experiment to check if your environment is ready +For example, run the command +``` +nnictl create --config ~/nni/examples/trials/mnist/config.yml +``` +And open web ui to check if everything is OK + +**6. Redeploy** + +After you change some code, just use **step 4** to rebuild your code, then the change will take effect immediately + +--- +At last, wish you have a wonderful day. 
\ No newline at end of file diff --git a/docs/ToContribute.md b/docs/ToContribute.md deleted file mode 100644 index b19602ed7e..0000000000 --- a/docs/ToContribute.md +++ /dev/null @@ -1,3 +0,0 @@ -## How to contribute - -TBD \ No newline at end of file diff --git a/docs/WriteYourTrial.md b/docs/WriteYourTrial.md index 82dfe3b1d1..18388aa9fd 100644 --- a/docs/WriteYourTrial.md +++ b/docs/WriteYourTrial.md @@ -1,9 +1,14 @@ -**Write a Trial which can Run on NNI** +**Write a Trial Run on NNI** === -There would be only a few changes on your existing trial(model) code to make the code runnable on NNI. We provide two approaches for you to modify your code: `Python annotation` and `NNI APIs for trial` -## NNI APIs -We also support NNI APIs for trial code. By using this approach, you should first prepare a search space file. An example is shown below: +A **Trial** in NNI is an individual attempt at applying a set of parameters on a model. + +To define a NNI trial, you need to firstly define the set of parameters and then update the model. NNI provide two approaches for you to define a trial: `NNI API` and `NNI Python annotation`. + +## NNI API +>Step 1 - Prepare a SearchSpace parameters file. + +An example is shown below: ``` { "dropout_rate":{"_type":"uniform","_value":[0.1,0.5]}, @@ -12,32 +17,71 @@ We also support NNI APIs for trial code. By using this approach, you should firs "learning_rate":{"_type":"uniform","_value":[0.0001, 0.1]} } ``` -You can refer to [here](SearchSpaceSpec.md) for the tutorial of search space. +Refer to [SearchSpaceSpec.md](SearchSpaceSpec.md) to learn more about search space. -Then, include `import nni` in your trial code to use NNI APIs. Using the line: -``` -RECEIVED_PARAMS = nni.get_parameters() -``` -to get hyper-parameters' values assigned by tuner. 
`RECEIVED_PARAMS` is an object, for example: -``` -{"conv_size": 2, "hidden_size": 124, "learning_rate": 0.0307, "dropout_rate": 0.2029} -``` +>Step 2 - Update model codes +~~~~ +2.1 Declare NNI API + Include `import nni` in your trial code to use NNI APIs. + +2.2 Get predefined parameters + Use the following code snippet: + + RECEIVED_PARAMS = nni.get_parameters() + + to get hyper-parameters' values assigned by tuner. `RECEIVED_PARAMS` is an object, for example: + + {"conv_size": 2, "hidden_size": 124, "learning_rate": 0.0307, "dropout_rate": 0.2029} + +2.3 Report NNI results + Use the API: -On the other hand, you can use the API: `nni.report_intermediate_result(accuracy)` to send `accuracy` to assessor. And use `nni.report_final_result(accuracy)` to send `accuracy` to tuner. Here `accuracy` could be any python data type, but **NOTE that if you use built-in tuner/assessor, `accuracy` should be a numerical variable(e.g. float, int)**. + `nni.report_intermediate_result(accuracy)` + + to send `accuracy` to assessor. + + Use the API: -The assessor will decide which trial should early stop based on the history performance of trial(intermediate result of one trial). -The tuner will generate next parameters/architecture based on the explore history(final result of all trials). + `nni.report_final_result(accuracy)` + + to send `accuracy` to tuner. +~~~~ + +**NOTE**: +~~~~ +accuracy - The `accuracy` could be any python object, but if you use NNI built-in tuner/assessor, `accuracy` should be a numerical variable (e.g. float, int). +assessor - The assessor will decide which trial should early stop based on the history performance of trial (intermediate result of one trial). +tuner - The tuner will generate next parameters/architecture based on the explore history (final result of all trials). 
+~~~~ + +>Step 3 - Enable NNI API + +To enable NNI API mode, you need to set useAnnotation to *false* and provide the path of SearchSpace file (you just defined in step 1): -In the yaml configure file, you need two lines to enable NNI APIs: ``` useAnnotation: false searchSpacePath: /path/to/your/search_space.json ``` -You can refer to [here](../examples/trials/README.md) for more information about how to write trial code using NNI APIs. +You can refer to [here](ExperimentConfig.md) for more information about how to set up experiment configurations. + +(../examples/trials/README.md) for more information about how to write trial code using NNI APIs. + +## NNI Python Annotation +An alternative to write a trial is to use NNI's syntax for python. Simple as any annotation, NNI annotation is working like comments in your codes. You don't have to make structure or any other big changes to your existing codes. With a few lines of NNI annotation, you will be able to: +* annotate the variables you want to tune +* specify in which range you want to tune the variables +* annotate which variable you want to report as intermediate result to `assessor` +* annotate which variable you want to report as the final result (e.g. model accuracy) to `tuner`. + +Again, take MNIST as an example, it only requires 2 steps to write a trial with NNI Annotation. + +>Step 1 - Update codes with annotations + +Please refer the following tensorflow code snippet for NNI Annotation, the highlighted 4 lines are annotations that help you to: (1) tune batch\_size and (2) dropout\_rate, (3) report test\_acc every 100 steps, and (4) at last report test\_acc as final result. + +>What noteworthy is: as these new added codes are annotations, it does not actually change your previous codes logic, you can still run your code as usual in environments without NNI installed. 
-## NNI Annotation -We designed a new syntax for users to annotate the variables they want to tune and in what range they want to tune the variables. Also, they can annotate which variable they want to report as intermediate result to `assessor`, and which variable to report as the final result (e.g. model accuracy) to `tuner`. A really appealing feature of our NNI annotation is that it exists as comments in your code, which means you can run your code as before without NNI. Let's look at an example, below is a piece of tensorflow code: ```diff with tf.Session() as sess: sess.run(tf.global_variables_initializer()) @@ -64,14 +108,16 @@ with tf.Session() as sess: + """@nni.report_final_result(test_acc)""" ``` -Let's say you want to tune batch\_size and dropout\_rate, and report test\_acc every 100 steps, at last report test\_acc as final result. With our NNI annotation, your code would look like below: +>NOTE +>>`@nni.variable` will take effect on its following line +>> +>>`@nni.report_intermediate_result`/`@nni.report_final_result` will send the data to assessor/tuner at that line. +>> +>>Please refer to [Annotation README](../tools/annotation/README.md) for more information about annotation syntax and its usage. -Simply adding four lines would make your code runnable on NNI. You can still run your code independently. `@nni.variable` works on its next line assignment, and `@nni.report_intermediate_result`/`@nni.report_final_result` would send the data to assessor/tuner at that line. Please refer to [here](../tools/annotation/README.md) for more annotation syntax and more powerful usage. 
In the yaml configure file, you need one line to enable NNI annotation: +>Step 2 - Enable NNI Annotation +In the yaml configure file, you need to set *useAnnotation* to true to enable NNI annotation: ``` useAnnotation: true ``` - -For users to correctly leverage NNI annotation, we briefly introduce how NNI annotation works here: NNI precompiles users' trial code to find all the annotations each of which is one line with `"""@nni` at the head of the line. Then NNI replaces each annotation with a corresponding NNI API at the location where the annotation is. - -**Note that: in your trial code, you can use either one of NNI APIs and NNI annotation, but not both of them simultaneously.** \ No newline at end of file diff --git a/examples/trials/ga_squad/README.md b/examples/trials/ga_squad/README.md new file mode 100644 index 0000000000..ab8ba853f7 --- /dev/null +++ b/examples/trials/ga_squad/README.md @@ -0,0 +1,254 @@ +# Automatic Model Architecture Search for Reading Comprehension +This example shows us how to use Genetic Algorithm to find good model architectures for Reading Comprehension task. + +## Search Space +Since attention and recurrent neural network (RNN) module have been proven effective in Reading Comprehension. +We conclude the search space as follow: + +1. IDENTITY (Effectively means keep training). +2. INSERT-RNN-LAYER (Inserts a LSTM. Comparing the performance of GRU and LSTM in our experiment, we decided to use LSTM here.) +3. REMOVE-RNN-LAYER +4. INSERT-ATTENTION-LAYER(Inserts a attention layer.) +5. REMOVE-ATTENTION-LAYER +6. ADD-SKIP (Identity between random layers). +7. REMOVE-SKIP (Removes random skip). + +![ga-squad-logo](./ga_squad.png) + +## New version +Also we have another version which time cost is less and performance is better. We will release soon. + +# How to run this example? 
+ +## Download data + +### Use downloading script to download data + +Execute the following command to download needed files +using the downloading script: + +``` +chmod +x ./download.sh +./download.sh +``` + +### Download manually + +1. download "dev-v1.1.json" and "train-v1.1.json" in https://rajpurkar.github.io/SQuAD-explorer/ + +``` +wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json +wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json +``` + +2. download "glove.840B.300d.txt" in https://nlp.stanford.edu/projects/glove/ + +``` +wget http://nlp.stanford.edu/data/glove.840B.300d.zip +unzip glove.840B.300d.zip +``` + +## Update configuration +Modify `nni/examples/trials/ga_squad/config.yaml`, here is the default configuration: + +``` +authorName: default +experimentName: example_ga_squad +trialConcurrency: 1 +maxExecDuration: 1h +maxTrialNum: 1 +#choice: local, remote +trainingServicePlatform: local +#choice: true, false +useAnnotation: false +tuner: + codeDir: ~/nni/examples/tuners/ga_customer_tuner + classFileName: customer_tuner.py + className: CustomerTuner + classArgs: + optimize_mode: maximize +trial: + command: python3 trial.py + codeDir: ~/nni/examples/trials/ga_squad + gpuNum: 0 +``` + +In the "trial" part, if you want to use GPU to perform the architecture search, change `gpuNum` from `0` to `1`. You need to increase the `maxTrialNum` and `maxExecDuration`, according to how long you want to wait for the search result. + +`trialConcurrency` is the number of trials running concurrently, which is the number of GPUs you want to use, if you are setting `gpuNum` to 1. + +## submit this job + +``` +nnictl create --config ~/nni/examples/trials/ga_squad/config.yaml +``` + +# Techinal details about the trial + +## How does it works +The evolution-algorithm based architecture for question answering has two different parts just like any other examples: the trial and the tuner. 
+ +### The trial + +The trial has a lot of different files, functions and classes. Here we will only give most of those files a brief introduction: + +* `attention.py` contains an implementaion for attention mechanism in Tensorflow. +* `data.py` contains functions for data preprocessing. +* `evaluate.py` contains the evaluation script. +* `graph.py` contains the definition of the computation graph. +* `rnn.py` contains an implementaion for GRU in Tensorflow. +* `train_model.py` is a wrapper for the whole question answering model. + +Among those files, `trial.py` and `graph_to_tf.py` is special. + +`graph_to_tf.py` has a function named as `graph_to_network`, here is its skelton code: + +``` +def graph_to_network(input1, + input2, + input1_lengths, + input2_lengths, + graph, + dropout_rate, + is_training, + num_heads=1, + rnn_units=256): + topology = graph.is_topology() + layers = dict() + layers_sequence_lengths = dict() + num_units = input1.get_shape().as_list()[-1] + layers[0] = input1*tf.sqrt(tf.cast(num_units, tf.float32)) + \ + positional_encoding(input1, scale=False, zero_pad=False) + layers[1] = input2*tf.sqrt(tf.cast(num_units, tf.float32)) + layers[0] = dropout(layers[0], dropout_rate, is_training) + layers[1] = dropout(layers[1], dropout_rate, is_training) + layers_sequence_lengths[0] = input1_lengths + layers_sequence_lengths[1] = input2_lengths + for _, topo_i in enumerate(topology): + if topo_i == '|': + continue + if graph.layers[topo_i].graph_type == LayerType.input.value: + # ...... + elif graph.layers[topo_i].graph_type == LayerType.attention.value: + # ...... + # More layers to handle +``` + +As we can see, this function is actually a compiler, that converts the internal model DAG configuration (which will be introduced in the `Model configuration format` section) `graph`, to a Tensorflow computation graph. 
+ +``` +topology = graph.is_topology() +``` + +performs topological sorting on the internal graph representation, and the code inside the loop: + +``` +for _, topo_i in enumerate(topology): +``` + +performs actually conversion that maps each layer to a part in Tensorflow computation graph. + +### The tuner + +The tuner is much more simple than the trial. They actually share the same `graph.py`. Besides, the tuner has a `customer_tuner.py`, the most important class in which is `CustomerTuner`: + +``` +class CustomerTuner(Tuner): + # ...... + + def generate_parameters(self, parameter_id): + """Returns a set of trial graph config, as a serializable object. + parameter_id : int + """ + if len(self.population) <= 0: + logger.debug("the len of poplution lower than zero.") + raise Exception('The population is empty') + pos = -1 + for i in range(len(self.population)): + if self.population[i].result == None: + pos = i + break + if pos != -1: + indiv = copy.deepcopy(self.population[pos]) + self.population.pop(pos) + temp = json.loads(graph_dumps(indiv.config)) + else: + random.shuffle(self.population) + if self.population[0].result > self.population[1].result: + self.population[0] = self.population[1] + indiv = copy.deepcopy(self.population[0]) + self.population.pop(1) + indiv.mutation() + graph = indiv.config + temp = json.loads(graph_dumps(graph)) + + # ...... +``` + +As we can see, the overloaded method `generate_parameters` implements a pretty naive mutation algorithm. The code lines: + +``` + if self.population[0].result > self.population[1].result: + self.population[0] = self.population[1] + indiv = copy.deepcopy(self.population[0]) +``` + +controls the mutation process. It will always take two random individuals in the population, only keeping and mutating the one with better result. + +## Model configuration format + +Here is an example of the model configuration, which is passed from the tuner to the trial in the architecture search procedure. 
+ +``` +{ + "max_layer_num": 50, + "layers": [ + { + "input_size": 0, + "type": 3, + "output_size": 1, + "input": [], + "size": "x", + "output": [4, 5], + "is_delete": false + }, + { + "input_size": 0, + "type": 3, + "output_size": 1, + "input": [], + "size": "y", + "output": [4, 5], + "is_delete": false + }, + { + "input_size": 1, + "type": 4, + "output_size": 0, + "input": [6], + "size": "x", + "output": [], + "is_delete": false + }, + { + "input_size": 1, + "type": 4, + "output_size": 0, + "input": [5], + "size": "y", + "output": [], + "is_delete": false + }, + {"Comment": "More layers will be here for actual graphs."} + ] +} +``` + +Every model configuration will has a "layers" section, which is a JSON list of layer definitions. The definition of each layer is also a JSON object, where: + + * `type` is the type of the layer. 0, 1, 2, 3, 4 corresponde to attention, self-attention, RNN, input and output layer respectively. + * `size` is the length of the output. "x", "y" corresponde to document length / question length, respectively. + * `input_size` is the number of inputs the layer has. + * `input` is the indices of layers taken as input of this layer. + * `output` is the indices of layers use this layer's output as their input. + * `is_delete` means whether the layer is still available. \ No newline at end of file diff --git a/examples/trials/ga_squad/ga_squad.png b/examples/trials/ga_squad/ga_squad.png new file mode 100644 index 0000000000..4c82cd4654 Binary files /dev/null and b/examples/trials/ga_squad/ga_squad.png differ diff --git a/examples/trials/ga_squad/readme.md b/examples/trials/ga_squad/readme.md deleted file mode 100644 index 99eaf12fd5..0000000000 --- a/examples/trials/ga_squad/readme.md +++ /dev/null @@ -1,33 +0,0 @@ -# Download data - -## Use downloading script - -Execute the following command to download needed files -using the downloading script: - -``` -chmod +x ./download.sh -./download.sh -``` - -## Download manually - -1. 
download "dev-v1.1.json" and "train-v1.1.json" in https://rajpurkar.github.io/SQuAD-explorer/ - -``` -wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -``` - -2. download "glove.840B.300d.txt" in https://nlp.stanford.edu/projects/glove/ - -``` -wget http://nlp.stanford.edu/data/glove.840B.300d.zip -unzip glove.840B.300d.zip -``` - -# How to submit this job - -1. run "$NNI_ROOT_DIR/auto_run.py" as "$NNI_ROOT_DIR/README-AUTO.md" said. -2. use the dockerImage openpai.azurecr.io/nni_v0.0.1, which means it use a tensorflow cpu-version. -3. this model don't need search_space.json. \ No newline at end of file diff --git a/examples/trials/mnist/config.yml b/examples/trials/mnist/config.yml index 331afab2b9..abb5a48db6 100644 --- a/examples/trials/mnist/config.yml +++ b/examples/trials/mnist/config.yml @@ -2,7 +2,7 @@ authorName: default experimentName: example_mnist trialConcurrency: 1 maxExecDuration: 1h -maxTrialNum: 1 +maxTrialNum: 100 #choice: local, remote trainingServicePlatform: local searchSpacePath: ~/nni/examples/trials/mnist/search_space.json diff --git a/setup.py b/setup.py index eeee54d075..25997c78c8 100644 --- a/setup.py +++ b/setup.py @@ -81,16 +81,10 @@ def run(self): 'pyyaml', 'requests', 'scipy', - 'schema' - ], - dependency_links = [ - 'git+https://github.com/hyperopt/hyperopt.git' + 'schema' ], cmdclass={ 'install': CustomInstallCommand - }, - entry_points={ - 'console_scripts': ['nnictl = nnicmd.nnictl:parse_args'] } ) diff --git a/src/nni_manager/common/datastore.ts b/src/nni_manager/common/datastore.ts index b86b0a95fe..7ed2328d7d 100644 --- a/src/nni_manager/common/datastore.ts +++ b/src/nni_manager/common/datastore.ts @@ -22,7 +22,7 @@ import { ExperimentProfile, TrialJobStatistics } from './manager'; import { TrialJobDetail, TrialJobStatus } from './trainingService'; -type TrialJobEvent = TrialJobStatus | 'USER_TO_CANCEL' | 'ADD_CUSTOMIZED'; +type 
TrialJobEvent = TrialJobStatus | 'USER_TO_CANCEL' | 'ADD_CUSTOMIZED' | 'ADD_HYPERPARAMETER'; type MetricType = 'PERIODICAL' | 'FINAL' | 'CUSTOM'; interface ExperimentProfileRecord { @@ -62,7 +62,7 @@ interface TrialJobInfo { status: TrialJobStatus; startTime?: number; endTime?: number; - hyperParameters?: string; + hyperParameters?: string[]; logPath?: string; finalMetricData?: string; stderrPath?: string; diff --git a/src/nni_manager/common/log.ts b/src/nni_manager/common/log.ts index da1c4e5c0b..0b3945746b 100644 --- a/src/nni_manager/common/log.ts +++ b/src/nni_manager/common/log.ts @@ -40,7 +40,7 @@ class BufferSerialEmitter { private writable: Writable; constructor(writable: Writable) { - this.buffer = new Buffer(0); + this.buffer = Buffer.alloc(0); this.emitting = false; this.writable = writable; } @@ -61,7 +61,7 @@ class BufferSerialEmitter { this.emit(); } }); - this.buffer = new Buffer(0); + this.buffer = Buffer.alloc(0); } } diff --git a/src/nni_manager/common/manager.ts b/src/nni_manager/common/manager.ts index 10fb9a4227..1d02a1775d 100644 --- a/src/nni_manager/common/manager.ts +++ b/src/nni_manager/common/manager.ts @@ -31,6 +31,7 @@ interface ExperimentParams { maxExecDuration: number; //seconds maxTrialNum: number; searchSpace: string; + multiPhase?: boolean; tuner: { className: string; builtinTunerName?: string; diff --git a/src/nni_manager/common/trainingService.ts b/src/nni_manager/common/trainingService.ts index 0b8708394c..7bcc575c34 100644 --- a/src/nni_manager/common/trainingService.ts +++ b/src/nni_manager/common/trainingService.ts @@ -37,11 +37,16 @@ interface JobApplicationForm { readonly jobType: JobType; } +interface HyperParameters { + readonly value: string; + readonly index: number; +} + /** * define TrialJobApplicationForm */ interface TrialJobApplicationForm extends JobApplicationForm { - readonly hyperParameters: string; + readonly hyperParameters: HyperParameters; } /** @@ -116,6 +121,6 @@ abstract class TrainingService { export { 
TrainingService, TrainingServiceError, TrialJobStatus, TrialJobApplicationForm, - TrainingServiceMetadata, TrialJobDetail, TrialJobMetric, + TrainingServiceMetadata, TrialJobDetail, TrialJobMetric, HyperParameters, HostJobApplicationForm, JobApplicationForm, JobType }; diff --git a/src/nni_manager/common/utils.ts b/src/nni_manager/common/utils.ts index ba0650ef28..1356d0347b 100644 --- a/src/nni_manager/common/utils.ts +++ b/src/nni_manager/common/utils.ts @@ -158,8 +158,11 @@ function parseArg(names: string[]): string { * @param assessor: similiar as tuner * */ -function getMsgDispatcherCommand(tuner: any, assessor: any): string { +function getMsgDispatcherCommand(tuner: any, assessor: any, multiPhase: boolean = false): string { let command: string = `python3 -m nni --tuner_class_name ${tuner.className}`; + if (multiPhase) { + command += ' --multi_phase'; + } if (process.env.VIRTUAL_ENV) { command = path.join(process.env.VIRTUAL_ENV, 'bin/') +command; diff --git a/src/nni_manager/core/commands.ts b/src/nni_manager/core/commands.ts index 37bba31d9e..19204b2f31 100644 --- a/src/nni_manager/core/commands.ts +++ b/src/nni_manager/core/commands.ts @@ -27,6 +27,7 @@ const TRIAL_END = 'EN'; const TERMINATE = 'TE'; const NEW_TRIAL_JOB = 'TR'; +const SEND_TRIAL_JOB_PARAMETER = 'SP'; const NO_MORE_TRIAL_JOBS = 'NO'; const KILL_TRIAL_JOB = 'KI'; @@ -39,6 +40,7 @@ const TUNER_COMMANDS: Set = new Set([ TERMINATE, NEW_TRIAL_JOB, + SEND_TRIAL_JOB_PARAMETER, NO_MORE_TRIAL_JOBS ]); @@ -63,5 +65,6 @@ export { NO_MORE_TRIAL_JOBS, KILL_TRIAL_JOB, TUNER_COMMANDS, - ASSESSOR_COMMANDS + ASSESSOR_COMMANDS, + SEND_TRIAL_JOB_PARAMETER }; diff --git a/src/nni_manager/core/nniDataStore.ts b/src/nni_manager/core/nniDataStore.ts index 47c2f01dc3..1beec632be 100644 --- a/src/nni_manager/core/nniDataStore.ts +++ b/src/nni_manager/core/nniDataStore.ts @@ -118,6 +118,7 @@ class NNIDataStore implements DataStore { } public async storeMetricData(trialJobId: string, data: string): Promise { + 
this.log.debug(`storeMetricData: trialJobId: ${trialJobId}, data: ${data}`); const metrics = JSON.parse(data) as MetricData; assert(trialJobId === metrics.trial_job_id); await this.db.storeMetricData(trialJobId, JSON.stringify({ @@ -168,18 +169,34 @@ class NNIDataStore implements DataStore { } } - private getJobStatusByLatestEvent(event: TrialJobEvent): TrialJobStatus { + private getJobStatusByLatestEvent(oldStatus: TrialJobStatus, event: TrialJobEvent): TrialJobStatus { switch (event) { case 'USER_TO_CANCEL': return 'USER_CANCELED'; case 'ADD_CUSTOMIZED': return 'WAITING'; + case 'ADD_HYPERPARAMETER': + return oldStatus; default: } return event; } + private mergeHyperParameters(hyperParamList: string[], newParamStr: string): string[] { + const mergedHyperParams: any[] = []; + const newParam: any = JSON.parse(newParamStr); + for (const hyperParamStr of hyperParamList) { + const hyperParam: any = JSON.parse(hyperParamStr); + mergedHyperParams.push(hyperParam); + } + if (mergedHyperParams.filter((value: any) => { return value.parameter_index === newParam.parameter_index; }).length <= 0) { + mergedHyperParams.push(newParam); + } + + return mergedHyperParams.map((value: any) => { return JSON.stringify(value); }); + } + private getTrialJobsByReplayEvents(trialJobEvents: TrialJobEventRecord[]): Map { const map: Map = new Map(); // assume data is stored by time ASC order @@ -193,7 +210,8 @@ class NNIDataStore implements DataStore { } else { jobInfo = { id: record.trialJobId, - status: this.getJobStatusByLatestEvent(record.event) + status: this.getJobStatusByLatestEvent('UNKNOWN', record.event), + hyperParameters: [] }; } if (!jobInfo) { @@ -222,9 +240,13 @@ class NNIDataStore implements DataStore { } default: } - jobInfo.status = this.getJobStatusByLatestEvent(record.event); + jobInfo.status = this.getJobStatusByLatestEvent(jobInfo.status, record.event); if (record.data !== undefined && record.data.trim().length > 0) { - jobInfo.hyperParameters = record.data; + if 
(jobInfo.hyperParameters !== undefined) { + jobInfo.hyperParameters = this.mergeHyperParameters(jobInfo.hyperParameters, record.data); + } else { + assert(false, 'jobInfo.hyperParameters is undefined'); + } } map.set(record.trialJobId, jobInfo); } diff --git a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts index 48d9fa3c83..0647e775a3 100644 --- a/src/nni_manager/core/nnimanager.ts +++ b/src/nni_manager/core/nnimanager.ts @@ -37,7 +37,7 @@ import { import { delay , getLogDir, getMsgDispatcherCommand} from '../common/utils'; import { ADD_CUSTOMIZED_TRIAL_JOB, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, REPORT_METRIC_DATA, - REQUEST_TRIAL_JOBS, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE + REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE } from './commands'; import { createDispatcherInterface, IpcInterface } from './ipcInterface'; import { TrialJobMaintainerEvent, TrialJobs } from './trialJobs'; @@ -116,7 +116,7 @@ class NNIManager implements Manager { await this.storeExperimentProfile(); this.log.debug('Setup tuner...'); - const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor); + const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.multiPhase); console.log(`dispatcher command: ${dispatcherCommand}`); this.setupTuner( //expParams.tuner.tunerCommand, @@ -140,7 +140,7 @@ class NNIManager implements Manager { this.experimentProfile = await this.dataStore.getExperimentProfile(experimentId); const expParams: ExperimentParams = this.experimentProfile.params; - const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor); + const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.multiPhase); console.log(`dispatcher command: ${dispatcherCommand}`); this.setupTuner( dispatcherCommand, @@ -460,7 +460,10 @@ class NNIManager implements 
Manager { this.currSubmittedTrialNum++; const trialJobAppForm: TrialJobApplicationForm = { jobType: 'TRIAL', - hyperParameters: content + hyperParameters: { + value: content, + index: 0 + } }; const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(trialJobAppForm); this.trialJobsMaintainer.setTrialJob(trialJobDetail.id, Object.assign({}, trialJobDetail)); @@ -472,6 +475,22 @@ class NNIManager implements Manager { } } break; + case SEND_TRIAL_JOB_PARAMETER: + const tunerCommand: any = JSON.parse(content); + assert(tunerCommand.parameter_index >= 0); + assert(tunerCommand.trial_job_id !== undefined); + + const trialJobForm: TrialJobApplicationForm = { + jobType: 'TRIAL', + hyperParameters: { + value: content, + index: tunerCommand.parameter_index + } + }; + await this.trainingService.updateTrialJob(tunerCommand.trial_job_id, trialJobForm); + await this.dataStore.storeTrialJobEvent( + 'ADD_HYPERPARAMETER', tunerCommand.trial_job_id, content, undefined); + break; case NO_MORE_TRIAL_JOBS: this.trialJobsMaintainer.setNoMoreTrials(); break; diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index 218a8c22c4..a1c32d396f 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -47,6 +47,7 @@ export namespace ValidationSchemas { trialConcurrency: joi.number().min(0).required(), searchSpace: joi.string().required(), maxExecDuration: joi.number().min(0).required(), + multiPhase: joi.boolean(), tuner: joi.object({ builtinTunerName: joi.string().valid('TPE', 'Random', 'Anneal', 'Evolution'), codeDir: joi.string(), diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts index 50eb9b3846..b66dd8d68c 100644 --- a/src/nni_manager/training_service/local/localTrainingService.ts +++ 
b/src/nni_manager/training_service/local/localTrainingService.ts @@ -30,10 +30,11 @@ import { getLogger, Logger } from '../../common/log'; import { TrialConfig } from '../common/trialConfig'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { - HostJobApplicationForm, JobApplicationForm, TrainingService, TrialJobApplicationForm, + HostJobApplicationForm, JobApplicationForm, HyperParameters, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; import { delay, getExperimentRootDir, uniqueString } from '../../common/utils'; +import { file } from 'tmp'; const tkill = require('tree-kill'); @@ -210,8 +211,18 @@ class LocalTrainingService implements TrainingService { * @param trialJobId trial job id * @param form job application form */ - public updateTrialJob(trialJobId: string, form: JobApplicationForm): Promise { - throw new MethodNotImplementedError(); + public async updateTrialJob(trialJobId: string, form: JobApplicationForm): Promise { + const trialJobDetail: undefined | TrialJobDetail = this.jobMap.get(trialJobId); + if (trialJobDetail === undefined) { + throw new Error(`updateTrialJob failed: ${trialJobId} not found`); + } + if (form.jobType === 'TRIAL') { + await this.writeParameterFile(trialJobDetail.workingDirectory, (form).hyperParameters); + } else { + throw new Error(`updateTrialJob failed: jobType ${form.jobType} not supported.`); + } + + return trialJobDetail; } /** @@ -332,10 +343,7 @@ class LocalTrainingService implements TrainingService { await cpp.exec(`mkdir -p ${path.join(trialJobDetail.workingDirectory, '.nni')}`); await cpp.exec(`touch ${path.join(trialJobDetail.workingDirectory, '.nni', 'metrics')}`); await fs.promises.writeFile(path.join(trialJobDetail.workingDirectory, 'run.sh'), runScriptLines.join('\n'), { encoding: 'utf8' }); - await fs.promises.writeFile( - path.join(trialJobDetail.workingDirectory, 'parameter.cfg'), - 
(trialJobDetail.form).hyperParameters, - { encoding: 'utf8' }); + await this.writeParameterFile(trialJobDetail.workingDirectory, (trialJobDetail.form).hyperParameters); const process: cp.ChildProcess = cp.exec(`bash ${path.join(trialJobDetail.workingDirectory, 'run.sh')}`); this.setTrialJobStatus(trialJobDetail, 'RUNNING'); @@ -402,6 +410,11 @@ class LocalTrainingService implements TrainingService { } } } + + private async writeParameterFile(directory: string, hyperParameters: HyperParameters): Promise { + const filepath: string = path.join(directory, `parameter_${hyperParameters.index}.cfg`); + await fs.promises.writeFile(filepath, hyperParameters.value, { encoding: 'utf8' }); + } } export { LocalTrainingService }; diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index 772b93ff5d..6ca4552f63 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -34,7 +34,7 @@ import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; import { ObservableTimer } from '../../common/observableTimer'; import { - HostJobApplicationForm, JobApplicationForm, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric + HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; import { delay, getExperimentRootDir, uniqueString } from '../../common/utils'; import { GPUSummary } from '../common/gpuData'; @@ -198,8 +198,24 @@ class RemoteMachineTrainingService implements TrainingService { * @param trialJobId trial job id * @param form job application form */ - public updateTrialJob(trialJobId: string, form: JobApplicationForm): Promise { - throw new 
MethodNotImplementedError(); + public async updateTrialJob(trialJobId: string, form: JobApplicationForm): Promise { + this.log.info(`updateTrialJob: form: ${JSON.stringify(form)}`); + const trialJobDetail: undefined | TrialJobDetail = this.trialJobsMap.get(trialJobId); + if (trialJobDetail === undefined) { + throw new Error(`updateTrialJob failed: ${trialJobId} not found`); + } + if (form.jobType === 'TRIAL') { + const rmMeta: RemoteMachineMeta | undefined = (trialJobDetail).rmMeta; + if (rmMeta !== undefined) { + await this.writeParameterFile(trialJobId, (form).hyperParameters, rmMeta); + } else { + throw new Error(`updateTrialJob failed: ${trialJobId} rmMeta not found`); + } + } else { + throw new Error(`updateTrialJob failed: jobType ${form.jobType} not supported.`); + } + + return trialJobDetail; } /** @@ -442,15 +458,13 @@ class RemoteMachineTrainingService implements TrainingService { //create tmp trial working folder locally. await cpp.exec(`mkdir -p ${trialLocalTempFolder}`); - // Write file content ( run.sh and parameter.cfg ) to local tmp files + // Write file content ( run.sh and parameter_0.cfg ) to local tmp files await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run.sh'), runScriptContent, { encoding: 'utf8' }); - await fs.promises.writeFile(path.join(trialLocalTempFolder, 'parameter.cfg'), form.hyperParameters, { encoding: 'utf8' }); // Copy local tmp files to remote machine await SSHClientUtility.copyFileToRemote( path.join(trialLocalTempFolder, 'run.sh'), path.join(trialWorkingFolder, 'run.sh'), sshClient); - await SSHClientUtility.copyFileToRemote( - path.join(trialLocalTempFolder, 'parameter.cfg'), path.join(trialWorkingFolder, 'parameter.cfg'), sshClient); + await this.writeParameterFile(trialJobId, form.hyperParameters, rmScheduleInfo.rmMeta); // Copy files in codeDir to remote working directory await SSHClientUtility.copyDirectoryToRemote(this.trialConfig.codeDir, trialWorkingFolder, sshClient); @@ -562,6 +576,22 @@ class 
RemoteMachineTrainingService implements TrainingService { return jobpidPath; } + + private async writeParameterFile(trialJobId: string, hyperParameters: HyperParameters, rmMeta: RemoteMachineMeta): Promise { + const sshClient: Client | undefined = this.machineSSHClientMap.get(rmMeta); + if (sshClient === undefined) { + throw new Error('sshClient is undefined.'); + } + + const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId); + const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId); + + const fileName: string = `parameter_${hyperParameters.index}.cfg`; + const localFilepath: string = path.join(trialLocalTempFolder, fileName); + await fs.promises.writeFile(localFilepath, hyperParameters.value, { encoding: 'utf8' }); + + await SSHClientUtility.copyFileToRemote(localFilepath, path.join(trialWorkingFolder, fileName), sshClient); + } } export { RemoteMachineTrainingService }; diff --git a/src/nni_manager/training_service/test/remoteMachineTrainingService.test.ts b/src/nni_manager/training_service/test/remoteMachineTrainingService.test.ts index b55e041e1d..7509ea2ade 100644 --- a/src/nni_manager/training_service/test/remoteMachineTrainingService.test.ts +++ b/src/nni_manager/training_service/test/remoteMachineTrainingService.test.ts @@ -100,7 +100,10 @@ describe('Unit Test for RemoteMachineTrainingService', () => { TrialConfigMetadataKey.TRIAL_CONFIG, `{"command":"sleep 1h && echo ","codeDir":"${localCodeDir}","gpuNum":1}`); const form: TrialJobApplicationForm = { jobType: 'TRIAL', - hyperParameters: 'mock hyperparameters' + hyperParameters: { + value: 'mock hyperparameters', + index: 0 + } }; const trialJob = await remoteMachineTrainingService.submitTrialJob(form); @@ -135,7 +138,10 @@ describe('Unit Test for RemoteMachineTrainingService', () => { // submit job const form: TrialJobApplicationForm = { jobType: 'TRIAL', - hyperParameters: 'mock hyperparameters' + hyperParameters: { + value: 'mock 
hyperparameters', + index: 0 + } }; const jobDetail: TrialJobDetail = await remoteMachineTrainingService.submitTrialJob(form); // Add metrics listeners diff --git a/src/sdk/pynni/nni/__main__.py b/src/sdk/pynni/nni/__main__.py index e3a39bac96..cf839de87d 100644 --- a/src/sdk/pynni/nni/__main__.py +++ b/src/sdk/pynni/nni/__main__.py @@ -28,6 +28,7 @@ import importlib from nni.msg_dispatcher import MsgDispatcher +from nni.multi_phase.multi_phase_dispatcher import MultiPhaseMsgDispatcher from nni.hyperopt_tuner.hyperopt_tuner import HyperoptTuner from nni.evolution_tuner.evolution_tuner import EvolutionTuner from nni.batch_tuner.batch_tuner import BatchTuner @@ -79,6 +80,7 @@ def parse_args(): help='Assessor directory') parser.add_argument('--assessor_class_filename', type=str, required=False, help='Assessor class file path') + parser.add_argument('--multi_phase', action='store_true') flags, _ = parser.parse_known_args() return flags @@ -110,7 +112,10 @@ def main(): if tuner is None: raise AssertionError('Failed to create Tuner instance') - dispatcher = MsgDispatcher(tuner, assessor) + if args.multi_phase: + dispatcher = MultiPhaseMsgDispatcher(tuner, assessor) + else: + dispatcher = MsgDispatcher(tuner, assessor) try: dispatcher.run() diff --git a/src/sdk/pynni/nni/multi_phase/multi_phase_dispatcher.py b/src/sdk/pynni/nni/multi_phase/multi_phase_dispatcher.py new file mode 100644 index 0000000000..ec7d2be0f1 --- /dev/null +++ b/src/sdk/pynni/nni/multi_phase/multi_phase_dispatcher.py @@ -0,0 +1,178 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +# associated documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, distribute, +# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or +# substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# ================================================================================================== + +import logging +from collections import defaultdict +import json_tricks + +from nni.protocol import CommandType, send +from nni.msg_dispatcher_base import MsgDispatcherBase +from nni.assessor import AssessResult + +_logger = logging.getLogger(__name__) + +# Assessor global variables +_trial_history = defaultdict(dict) +'''key: trial job ID; value: intermediate results, mapping from sequence number to data''' + +_ended_trials = set() +'''trial_job_id of all ended trials. +We need this because NNI manager may send metrics after reporting a trial ended. 
+TODO: move this logic to NNI manager +''' + +def _sort_history(history): + ret = [ ] + for i, _ in enumerate(history): + if i in history: + ret.append(history[i]) + else: + break + return ret + +# Tuner global variables +_next_parameter_id = 0 +_trial_params = {} +'''key: trial job ID; value: parameters''' +_customized_parameter_ids = set() + +def _create_parameter_id(): + global _next_parameter_id # pylint: disable=global-statement + _next_parameter_id += 1 + return _next_parameter_id - 1 + +def _pack_parameter(parameter_id, params, customized=False, trial_job_id=None, parameter_index=None): + _trial_params[parameter_id] = params + ret = { + 'parameter_id': parameter_id, + 'parameter_source': 'customized' if customized else 'algorithm', + 'parameters': params + } + if trial_job_id is not None: + ret['trial_job_id'] = trial_job_id + if parameter_index is not None: + ret['parameter_index'] = parameter_index + else: + ret['parameter_index'] = 0 + return json_tricks.dumps(ret) + +class MultiPhaseMsgDispatcher(MsgDispatcherBase): + def __init__(self, tuner, assessor=None): + super() + self.tuner = tuner + self.assessor = assessor + if assessor is None: + _logger.debug('Assessor is not configured') + + def load_checkpoint(self): + self.tuner.load_checkpoint() + if self.assessor is not None: + self.assessor.load_checkpoint() + + def save_checkpoint(self): + self.tuner.save_checkpoint() + if self.assessor is not None: + self.assessor.save_checkpoint() + + def handle_request_trial_jobs(self, data): + # data: number or trial jobs + ids = [_create_parameter_id() for _ in range(data)] + params_list = self.tuner.generate_multiple_parameters(ids) + assert len(ids) == len(params_list) + for i, _ in enumerate(ids): + send(CommandType.NewTrialJob, _pack_parameter(ids[i], params_list[i])) + return True + + def handle_update_search_space(self, data): + self.tuner.update_search_space(data) + return True + + def handle_add_customized_trial(self, data): + # data: parameters + id_ = 
_create_parameter_id() + _customized_parameter_ids.add(id_) + send(CommandType.NewTrialJob, _pack_parameter(id_, data, customized=True)) + return True + + def handle_report_metric_data(self, data): + trial_job_id = data['trial_job_id'] + if data['type'] == 'FINAL': + id_ = data['parameter_id'] + if id_ in _customized_parameter_ids: + self.tuner.receive_customized_trial_result(id_, _trial_params[id_], data['value'], trial_job_id) + else: + self.tuner.receive_trial_result(id_, _trial_params[id_], data['value'], trial_job_id) + elif data['type'] == 'PERIODICAL': + if self.assessor is not None: + self._handle_intermediate_metric_data(data) + else: + pass + elif data['type'] == 'REQUEST_PARAMETER': + assert data['trial_job_id'] is not None + assert data['parameter_index'] is not None + param_id = _create_parameter_id() + param = self.tuner.generate_parameters(param_id, trial_job_id) + send(CommandType.SendTrialJobParameter, _pack_parameter(param_id, param, trial_job_id=data['trial_job_id'], parameter_index=data['parameter_index'])) + else: + raise ValueError('Data type not supported: {}'.format(data['type'])) + + return True + + def handle_trial_end(self, data): + trial_job_id = data['trial_job_id'] + _ended_trials.add(trial_job_id) + if trial_job_id in _trial_history: + _trial_history.pop(trial_job_id) + if self.assessor is not None: + self.assessor.trial_end(trial_job_id, data['event'] == 'SUCCEEDED') + return True + + def _handle_intermediate_metric_data(self, data): + if data['type'] != 'PERIODICAL': + return True + if self.assessor is None: + return True + + trial_job_id = data['trial_job_id'] + if trial_job_id in _ended_trials: + return True + + history = _trial_history[trial_job_id] + history[data['sequence']] = data['value'] + ordered_history = _sort_history(history) + if len(ordered_history) < data['sequence']: # no user-visible update since last time + return True + + try: + result = self.assessor.assess_trial(trial_job_id, ordered_history) + except Exception 
as e: + _logger.exception('Assessor error') + + if isinstance(result, bool): + result = AssessResult.Good if result else AssessResult.Bad + elif not isinstance(result, AssessResult): + msg = 'Result of Assessor.assess_trial must be an object of AssessResult, not %s' + raise RuntimeError(msg % type(result)) + + if result is AssessResult.Bad: + _logger.debug('BAD, kill %s', trial_job_id) + send(CommandType.KillTrialJob, json_tricks.dumps(trial_job_id)) + else: + _logger.debug('GOOD') diff --git a/src/sdk/pynni/nni/multi_phase/multi_phase_tuner.py b/src/sdk/pynni/nni/multi_phase/multi_phase_tuner.py new file mode 100644 index 0000000000..1fb10ab676 --- /dev/null +++ b/src/sdk/pynni/nni/multi_phase/multi_phase_tuner.py @@ -0,0 +1,87 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +# associated documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, distribute, +# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or +# substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+# ================================================================================================== + + +import logging + +from nni.recoverable import Recoverable + +_logger = logging.getLogger(__name__) + + +class MultiPhaseTuner(Recoverable): + # pylint: disable=no-self-use,unused-argument + + def generate_parameters(self, parameter_id, trial_job_id=None): + """Returns a set of trial (hyper-)parameters, as a serializable object. + User code must override either this function or 'generate_multiple_parameters()'. + parameter_id: int + """ + raise NotImplementedError('Tuner: generate_parameters not implemented') + + def generate_multiple_parameters(self, parameter_id_list): + """Returns multiple sets of trial (hyper-)parameters, as iterable of serializable objects. + Call 'generate_parameters()' by 'count' times by default. + User code must override either this function or 'generate_parameters()'. + parameter_id_list: list of int + """ + return [self.generate_parameters(parameter_id) for parameter_id in parameter_id_list] + + def receive_trial_result(self, parameter_id, parameters, reward, trial_job_id): + """Invoked when a trial reports its final result. Must override. + parameter_id: int + parameters: object created by 'generate_parameters()' + reward: object reported by trial + """ + raise NotImplementedError('Tuner: receive_trial_result not implemented') + + def receive_customized_trial_result(self, parameter_id, parameters, reward, trial_job_id): + """Invoked when a trial added by WebUI reports its final result. Do nothing by default. + parameter_id: int + parameters: object created by user + reward: object reported by trial + """ + _logger.info('Customized trial job %s ignored by tuner', parameter_id) + + def update_search_space(self, search_space): + """Update the search space of tuner. Must override. 
+ search_space: JSON object + """ + raise NotImplementedError('Tuner: update_search_space not implemented') + + def load_checkpoint(self): + """Load the checkpoint of tuner. + path: checkpoint directory for tuner + """ + checkpoin_path = self.get_checkpoint_path() + _logger.info('Load checkpoint ignored by tuner, checkpoint path: %s' % checkpoin_path) + + def save_checkpoint(self): + """Save the checkpoint of tuner. + path: checkpoint directory for tuner + """ + checkpoin_path = self.get_checkpoint_path() + _logger.info('Save checkpoint ignored by tuner, checkpoint path: %s' % checkpoin_path) + + def _on_exit(self): + pass + + def _on_error(self): + pass diff --git a/src/sdk/pynni/nni/platform/local.py b/src/sdk/pynni/nni/platform/local.py index 3dda9c4c57..9a2f596c9d 100644 --- a/src/sdk/pynni/nni/platform/local.py +++ b/src/sdk/pynni/nni/platform/local.py @@ -18,23 +18,41 @@ # OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # ================================================================================================== - +import json_tricks import json import os +import time -from ..common import init_logger +from ..common import init_logger, env_args _dir = os.environ['NNI_SYS_DIR'] _metric_file = open(os.path.join(_dir, '.nni', 'metrics'), 'wb') +_param_index = 0 _log_file_path = os.path.join(_dir, 'trial.log') init_logger(_log_file_path) +def _send_request_parameter_metric(): + metric = json_tricks.dumps({ + 'trial_job_id': env_args.trial_job_id, + 'type': 'REQUEST_PARAMETER', + 'sequence': 0, + 'parameter_index': _param_index + }) + send_metric(metric) def get_parameters(): - params_file = open(os.path.join(_dir, 'parameter.cfg'), 'r') - return json.load(params_file) + global _param_index + params_filepath = os.path.join(_dir, 'parameter_{}.cfg'.format(_param_index)) + if not os.path.isfile(params_filepath): + _send_request_parameter_metric() + while not os.path.isfile(params_filepath): + time.sleep(3) + params_file = 
open(params_filepath, 'r') + params = json.load(params_file) + _param_index += 1 + return params def send_metric(string): data = (string + '\n').encode('utf8') diff --git a/src/sdk/pynni/nni/protocol.py b/src/sdk/pynni/nni/protocol.py index 6c0e71506d..ada5527bfa 100644 --- a/src/sdk/pynni/nni/protocol.py +++ b/src/sdk/pynni/nni/protocol.py @@ -34,6 +34,7 @@ class CommandType(Enum): # out NewTrialJob = b'TR' + SendTrialJobParameter = b'SP' NoMoreTrialJobs = b'NO' KillTrialJob = b'KI' @@ -55,7 +56,7 @@ def send(command, data): data = data.encode('utf8') assert len(data) < 1000000, 'Command too long' msg = b'%b%06d%b' % (command.value, len(data), data) - logging.getLogger(__name__).debug('Sending command, data: [%s]' % data) + logging.getLogger(__name__).debug('Sending command, data: [%s]' % msg) _out_file.write(msg) _out_file.flush() diff --git a/src/sdk/pynni/nni/trial.py b/src/sdk/pynni/nni/trial.py index e5884f8dac..27dc864d23 100644 --- a/src/sdk/pynni/nni/trial.py +++ b/src/sdk/pynni/nni/trial.py @@ -32,11 +32,13 @@ ] -_params = platform.get_parameters() +_params = None def get_parameters(): """Returns a set of (hyper-)paremeters generated by Tuner.""" + global _params + _params = platform.get_parameters() return _params['parameters'] @@ -51,6 +53,7 @@ def report_intermediate_result(metric): metric: serializable object. """ global _intermediate_seq + assert _params is not None, 'nni.get_parameters() needs to be called before report_intermediate_result' metric = json_tricks.dumps({ 'parameter_id': _params['parameter_id'], 'trial_job_id': env_args.trial_job_id, @@ -66,6 +69,7 @@ def report_final_result(metric): """Reports final result to tuner. metric: serializable object. 
""" + assert _params is not None, 'nni.get_parameters() needs to be called before report_final_result' metric = json_tricks.dumps({ 'parameter_id': _params['parameter_id'], 'trial_job_id': env_args.trial_job_id, diff --git a/src/sdk/pynni/tests/test_multi_phase_tuner.py b/src/sdk/pynni/tests/test_multi_phase_tuner.py new file mode 100644 index 0000000000..72b477999e --- /dev/null +++ b/src/sdk/pynni/tests/test_multi_phase_tuner.py @@ -0,0 +1,89 @@ +import logging +import random +from io import BytesIO + +import nni +import nni.protocol +from nni.protocol import CommandType, send, receive +from nni.multi_phase.multi_phase_tuner import MultiPhaseTuner +from nni.multi_phase.multi_phase_dispatcher import MultiPhaseMsgDispatcher + +from unittest import TestCase, main + +class NaiveMultiPhaseTuner(MultiPhaseTuner): + ''' + supports only choices + ''' + def __init__(self): + self.search_space = None + + def generate_parameters(self, parameter_id, trial_job_id=None): + """Returns a set of trial (hyper-)parameters, as a serializable object. + User code must override either this function or 'generate_multiple_parameters()'. 
+ parameter_id: int + """ + generated_parameters = {} + if self.search_space is None: + raise AssertionError('Search space not specified') + for k in self.search_space: + param = self.search_space[k] + if not param['_type'] == 'choice': + raise ValueError('Only choice type is supported') + param_values = param['_value'] + generated_parameters[k] = param_values[random.randint(0, len(param_values)-1)] + logging.getLogger(__name__).debug(generated_parameters) + return generated_parameters + + + def receive_trial_result(self, parameter_id, parameters, reward, trial_job_id): + logging.getLogger(__name__).debug('receive_trial_result: {},{},{},{}'.format(parameter_id, parameters, reward, trial_job_id)) + + def receive_customized_trial_result(self, parameter_id, parameters, reward, trial_job_id): + pass + + def update_search_space(self, search_space): + self.search_space = search_space + + +_in_buf = BytesIO() +_out_buf = BytesIO() + +def _reverse_io(): + _in_buf.seek(0) + _out_buf.seek(0) + nni.protocol._out_file = _in_buf + nni.protocol._in_file = _out_buf + +def _restore_io(): + _in_buf.seek(0) + _out_buf.seek(0) + nni.protocol._in_file = _in_buf + nni.protocol._out_file = _out_buf + +def _test_tuner(): + _reverse_io() # now we are sending to Tuner's incoming stream + send(CommandType.UpdateSearchSpace, "{\"learning_rate\": {\"_value\": [0.0001, 0.001, 0.002, 0.005, 0.01], \"_type\": \"choice\"}, \"optimizer\": {\"_value\": [\"Adam\", \"SGD\"], \"_type\": \"choice\"}}") + send(CommandType.RequestTrialJobs, '2') + send(CommandType.ReportMetricData, '{"parameter_id":0,"type":"PERIODICAL","value":10,"trial_job_id":"abc"}') + send(CommandType.ReportMetricData, '{"parameter_id":1,"type":"FINAL","value":11,"trial_job_id":"abc"}') + send(CommandType.AddCustomizedTrialJob, '{"param":-1}') + send(CommandType.ReportMetricData, '{"parameter_id":2,"type":"FINAL","value":22,"trial_job_id":"abc"}') + send(CommandType.RequestTrialJobs, '1') + send(CommandType.TrialEnd, 
'{"trial_job_id":"abc"}') + _restore_io() + + tuner = NaiveMultiPhaseTuner() + dispatcher = MultiPhaseMsgDispatcher(tuner) + dispatcher.run() + + _reverse_io() # now we are receiving from Tuner's outgoing stream + + command, data = receive() # this one is customized + print(command, data) + +class MultiPhaseTestCase(TestCase): + def test_tuner(self): + _test_tuner() + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/test/naive/run.py b/test/naive/run.py index 239bedcd2c..f54fe7ab71 100644 --- a/test/naive/run.py +++ b/test/naive/run.py @@ -54,7 +54,7 @@ def run(): if trial > current_trial: current_trial = trial print('Trial #%d done' % trial) - + subprocess.run(['nnictl', 'log', 'stderr']) assert tuner_status == 'DONE' and assessor_status == 'DONE', 'Failed to finish in 1 min' ss1 = json.load(open('search_space.json')) diff --git a/tools/nnicmd/config_schema.py b/tools/nnicmd/config_schema.py index 8cd8431151..57b9fd14e8 100644 --- a/tools/nnicmd/config_schema.py +++ b/tools/nnicmd/config_schema.py @@ -41,8 +41,8 @@ 'codeDir': os.path.exists, 'classFileName': str, 'className': str, - 'classArgs': { - 'optimize_mode': Or('maximize', 'minimize'), + Optional('classArgs'): { + Optional('optimize_mode'): Or('maximize', 'minimize'), Optional('speed'): int }, Optional('gpuNum'): And(int, lambda x: 0 <= x <= 99999), diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index b99a428b1d..a0b382b1ec 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -28,10 +28,10 @@ from nni_annotation import * import random from .launcher_utils import validate_all_content -from .rest_utils import rest_put, rest_post, check_rest_server, check_rest_server_quick +from .rest_utils import rest_put, rest_post, check_rest_server, check_rest_server_quick, check_response from .url_utils import cluster_metadata_url, experiment_url from .config_utils import Config -from .common_utils import get_yml_content, get_json_content, print_error, 
print_normal +from .common_utils import get_yml_content, get_json_content, print_error, print_normal, detect_process from .constants import EXPERIMENT_SUCCESS_INFO, STDOUT_FULL_PATH, STDERR_FULL_PATH, LOG_DIR, REST_PORT, ERROR_INFO, NORMAL_INFO from .webui_utils import start_web_ui, check_web_ui @@ -40,7 +40,8 @@ def start_rest_server(port, platform, mode, experiment_id=None): print_normal('Checking experiment...') nni_config = Config() rest_port = nni_config.get_config('restServerPort') - if rest_port and check_rest_server_quick(rest_port): + running, _ = check_rest_server_quick(rest_port) + if rest_port and running: print_error('There is an experiment running, please stop it first...') print_normal('You can use \'nnictl stop\' command to stop an experiment!') exit(0) @@ -66,7 +67,12 @@ def set_trial_config(experiment_config, port): value_dict['gpuNum'] = experiment_config['trial']['gpuNum'] request_data['trial_config'] = value_dict response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 20) - return True if response.status_code == 200 else False + if check_response(response): + return True + else: + with open(STDERR_FULL_PATH, 'a+') as fout: + fout.write(json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':'))) + return False def set_local_config(experiment_config, port): '''set local configuration''' @@ -79,9 +85,11 @@ def set_remote_config(experiment_config, port): request_data['machine_list'] = experiment_config['machineList'] response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 20) err_message = '' - if not response or not response.status_code == 200: + if not response or not check_response(response): if response is not None: err_message = response.text + with open(STDERR_FULL_PATH, 'a+') as fout: + fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) return False, err_message #set trial_config @@ -117,11 +125,22 @@ def 
set_experiment(experiment_config, mode, port): {'key': 'trial_config', 'value': value_dict}) response = rest_post(experiment_url(port), json.dumps(request_data), 20) - return response if response.status_code == 200 else None + if check_response(response): + return response + else: + with open(STDERR_FULL_PATH, 'a+') as fout: + fout.write(json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':'))) + return None def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=None): '''follow steps to start rest server and start experiment''' nni_config = Config() + #Check if there is an experiment running + origin_rest_pid = nni_config.get_config('restServerPid') + if origin_rest_pid and detect_process(origin_rest_pid): + print_error('There is an experiment running, please stop it first...') + print_normal('You can use \'nnictl stop\' command to stop an experiment!') + exit(0) # start rest server rest_process = start_rest_server(REST_PORT, experiment_config['trainingServicePlatform'], mode, experiment_id) nni_config.set_config('restServerPid', rest_process.pid) @@ -144,7 +163,8 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No # check rest server print_normal('Checking restful server...') - if check_rest_server(REST_PORT): + running, _ = check_rest_server(REST_PORT) + if running: print_normal('Restful server start success!') else: print_error('Restful server start failed!') diff --git a/tools/nnicmd/launcher_utils.py b/tools/nnicmd/launcher_utils.py index dc97910244..d9eb7347b8 100644 --- a/tools/nnicmd/launcher_utils.py +++ b/tools/nnicmd/launcher_utils.py @@ -99,7 +99,8 @@ def parse_tuner_content(experiment_config): if experiment_config['tuner'].get('builtinTunerName') and experiment_config['tuner'].get('classArgs'): experiment_config['tuner']['className'] = tuner_class_name_dict.get(experiment_config['tuner']['builtinTunerName']) - experiment_config['tuner']['classArgs']['algorithm_name'] 
= tuner_algorithm_name_dict.get(experiment_config['tuner']['builtinTunerName']) + if tuner_algorithm_name_dict.get(experiment_config['tuner']['builtinTunerName']): + experiment_config['tuner']['classArgs']['algorithm_name'] = tuner_algorithm_name_dict.get(experiment_config['tuner']['builtinTunerName']) elif experiment_config['tuner'].get('codeDir') and experiment_config['tuner'].get('classFileName') and experiment_config['tuner'].get('className'): if not os.path.exists(os.path.join(experiment_config['tuner']['codeDir'], experiment_config['tuner']['classFileName'])): raise ValueError('Tuner file directory is not valid!') diff --git a/tools/nnicmd/nnictl.py b/tools/nnicmd/nnictl.py index 73b2950a55..9762ca82c6 100644 --- a/tools/nnicmd/nnictl.py +++ b/tools/nnicmd/nnictl.py @@ -25,11 +25,11 @@ from .nnictl_utils import * def nni_help_info(*args): - print('please run "nnictl --help" to see nnictl guidance') + print('please run "nnictl {positional argument} --help" to see nnictl guidance') def parse_args(): '''Definite the arguments users need to follow and input''' - parser = argparse.ArgumentParser(prog='nni ctl', description='use nni control') + parser = argparse.ArgumentParser(prog='nnictl', description='use nnictl command to control nni experiments') parser.set_defaults(func=nni_help_info) # create subparsers for args with sub values @@ -95,6 +95,8 @@ def parse_args(): parser_experiment_subparsers = parser_experiment.add_subparsers() parser_experiment_show = parser_experiment_subparsers.add_parser('show', help='show the information of experiment') parser_experiment_show.set_defaults(func=list_experiment) + parser_experiment_status = parser_experiment_subparsers.add_parser('status', help='show the status of experiment') + parser_experiment_status.set_defaults(func=experiment_status) #parse config command parser_config = subparsers.add_parser('config', help='get config information') diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index 
80c778bc2f..7305df0980 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -23,7 +23,7 @@ import json import datetime from subprocess import call, check_output -from .rest_utils import rest_get, rest_delete, check_rest_server_quick +from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response from .config_utils import Config from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url from .constants import STDERR_FULL_PATH, STDOUT_FULL_PATH @@ -47,7 +47,8 @@ def check_rest(args): '''check if restful server is running''' nni_config = Config() rest_port = nni_config.get_config('restServerPort') - if check_rest_server_quick(rest_port): + running, _ = check_rest_server_quick(rest_port) + if running: print_normal('Restful server is running...') else: print_normal('Restful server is not running...') @@ -62,9 +63,10 @@ def stop_experiment(args): print_normal('Experiment is not running...') stop_web_ui() return - if check_rest_server_quick(rest_port): + running, _ = check_rest_server_quick(rest_port) + if running: response = rest_delete(experiment_url(rest_port), 20) - if not response or response.status_code != 200: + if not response or not check_response(response): print_error('Stop experiment failed!') #sleep to wait rest handler done time.sleep(3) @@ -82,9 +84,10 @@ def trial_ls(args): if not detect_process(rest_pid): print_error('Experiment is not running...') return - if check_rest_server_quick(rest_port): + running, response = check_rest_server_quick(rest_port) + if running: response = rest_get(trial_jobs_url(rest_port), 20) - if response and response.status_code == 200: + if response and check_response(response): content = json.loads(response.text) for index, value in enumerate(content): content[index] = convert_time_stamp_to_date(value) @@ -102,9 +105,10 @@ def trial_kill(args): if not detect_process(rest_pid): print_error('Experiment is not running...') return - if
check_rest_server_quick(rest_port): + running, _ = check_rest_server_quick(rest_port) + if running: response = rest_delete(trial_job_id_url(rest_port, args.trialid), 20) - if response and response.status_code == 200: + if response and check_response(response): print(response.text) else: print_error('Kill trial job failed...') @@ -119,9 +123,10 @@ def list_experiment(args): if not detect_process(rest_pid): print_error('Experiment is not running...') return - if check_rest_server_quick(rest_port): + running, _ = check_rest_server_quick(rest_port) + if running: response = rest_get(experiment_url(rest_port), 20) - if response and response.status_code == 200: + if response and check_response(response): content = convert_time_stamp_to_date(json.loads(response.text)) print(json.dumps(content, indent=4, sort_keys=True, separators=(',', ':'))) else: @@ -129,6 +134,16 @@ def list_experiment(args): else: print_error('Restful server is not running...') +def experiment_status(args): + '''Show the status of experiment''' + nni_config = Config() + rest_port = nni_config.get_config('restServerPort') + result, response = check_rest_server_quick(rest_port) + if not result: + print_normal('Restful server is not running...') + else: + print(json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':'))) + def get_log_content(file_name, cmds): '''use cmds to read config content''' if os.path.exists(file_name): diff --git a/tools/nnicmd/rest_utils.py b/tools/nnicmd/rest_utils.py index 3952d4a3c7..37690f2463 100644 --- a/tools/nnicmd/rest_utils.py +++ b/tools/nnicmd/rest_utils.py @@ -64,16 +64,22 @@ def check_rest_server(rest_port): response = rest_get(check_status_url(rest_port), 20) if response: if response.status_code == 200: - return True + return True, response else: - return False + return False, response else: time.sleep(3) - return False + return False, response def check_rest_server_quick(rest_port): '''Check if restful server is ready, only check once''' 
response = rest_get(check_status_url(rest_port), 5) + if response and response.status_code == 200: + return True, response + return False, None + +def check_response(response): + '''Check if a response is success according to status_code''' if response and response.status_code == 200: return True return False diff --git a/tools/nnicmd/updater.py b/tools/nnicmd/updater.py index 1b1fd57a95..c2afc0772d 100644 --- a/tools/nnicmd/updater.py +++ b/tools/nnicmd/updater.py @@ -21,7 +21,7 @@ import json import os -from .rest_utils import rest_put, rest_get, check_rest_server_quick +from .rest_utils import rest_put, rest_get, check_rest_server_quick, check_response from .url_utils import experiment_url from .config_utils import Config from .common_utils import get_json_content @@ -56,13 +56,14 @@ def update_experiment_profile(key, value): '''call restful server to update experiment profile''' nni_config = Config() rest_port = nni_config.get_config('restServerPort') - if check_rest_server_quick(rest_port): + running, _ = check_rest_server_quick(rest_port) + if running: response = rest_get(experiment_url(rest_port), 20) - if response and response.status_code == 200: + if response and check_response(response): experiment_profile = json.loads(response.text) experiment_profile['params'][key] = value response = rest_put(experiment_url(rest_port)+get_query_type(key), json.dumps(experiment_profile), 20) - if response and response.status_code == 200: + if response and check_response(response): return response else: print('ERROR: restful server is not running...') diff --git a/tools/nnicmd/webui_utils.py b/tools/nnicmd/webui_utils.py index 1121452a08..e66d24360a 100644 --- a/tools/nnicmd/webui_utils.py +++ b/tools/nnicmd/webui_utils.py @@ -22,8 +22,8 @@ import os import psutil from socket import AddressFamily -from subprocess import Popen, PIPE -from .rest_utils import rest_get +from subprocess import Popen, PIPE, call +from .rest_utils import rest_get, check_response from 
.config_utils import Config from .common_utils import print_error, print_normal from .constants import STDOUT_FULL_PATH, STDERR_FULL_PATH @@ -71,6 +71,8 @@ def stop_web_ui(): child_process.kill() if parent_process.is_running(): parent_process.kill() + cmds = ['pkill', '-P', str(webuiPid)] + call(cmds) return True except Exception as e: print_error(e) @@ -84,6 +86,6 @@ def check_web_ui(): return False for url in url_list: response = rest_get(url, 3) - if response and response.status_code == 200: + if response and check_response(response): return True return False