diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 839aa2c4d6..78917d879f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -49,9 +49,9 @@ jobs: sphinx-build -M html . _build -W displayName: 'Sphinx Documentation Build check' -- job: 'ubuntu_1604_python35_legacy_torch_tf' +- job: 'ubuntu_1804_python36_legacy_torch_tf' pool: - vmImage: 'Ubuntu 16.04' + vmImage: 'Ubuntu 18.04' steps: - script: | @@ -141,7 +141,7 @@ jobs: powershell.exe -file install.ps1 displayName: 'Install nni toolkit via source code' - script: | - python -m pip install scikit-learn==0.20.0 --user + python -m pip install scikit-learn==0.23.2 --user python -m pip install keras==2.1.6 --user python -m pip install torch==1.5.0+cpu torchvision==0.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html --user python -m pip install tensorflow==1.15.2 --user diff --git a/deployment/docker/Dockerfile b/deployment/docker/Dockerfile index 5e33cc6047..b5418ec783 100644 --- a/deployment/docker/Dockerfile +++ b/deployment/docker/Dockerfile @@ -1,12 +1,13 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04 +FROM nvidia/cuda:9.2-cudnn7-runtime-ubuntu18.04 LABEL maintainer='Microsoft NNI Team' -RUN DEBIAN_FRONTEND=noninteractive && \ - apt-get -y update && \ +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get -y update && \ apt-get -y install sudo \ apt-utils \ git \ @@ -21,7 +22,7 @@ RUN DEBIAN_FRONTEND=noninteractive && \ openssh-client \ openssh-server \ lsof \ - python3.5 \ + python3.6 \ python3-dev \ python3-pip \ python3-tk \ @@ -37,7 +38,7 @@ RUN cp /usr/bin/python3 /usr/bin/python # # update pip # -RUN python3 -m pip install --upgrade pip setuptools==39.1.0 +RUN python3 -m pip install --upgrade pip==20.0.2 setuptools==39.1.0 # numpy 1.14.3 scipy 1.1.0 RUN python3 -m pip --no-cache-dir install \ @@ -46,7 +47,7 @@ RUN python3 -m pip --no-cache-dir install \ # # Tensorflow 1.15 # -RUN python3 -m pip --no-cache-dir install tensorflow-gpu==1.15 +RUN python3 -m pip --no-cache-dir install tensorflow-gpu==1.15.0 # # Keras 2.1.6 @@ -60,9 +61,9 @@ RUN python3 -m pip --no-cache-dir install torch==1.4.0 RUN python3 -m pip install torchvision==0.5.0 # -# sklearn 0.20.0 +# sklearn 0.23.2 # -RUN python3 -m pip --no-cache-dir install scikit-learn==0.20.0 +RUN python3 -m pip --no-cache-dir install scikit-learn==0.23.2 # # pandas==0.23.4 lightgbm==2.2.2 diff --git a/deployment/docker/README.md b/deployment/docker/README.md index aa8dae38a4..1ee8d0b3bd 100644 --- a/deployment/docker/README.md +++ b/deployment/docker/README.md @@ -11,7 +11,7 @@ scipy 1.1.0 tensorflow-gpu 1.15.0 keras 2.1.6 torch 1.4.0 -scikit-learn 0.20.0 +scikit-learn 0.23.2 pandas 0.23.4 lightgbm 2.2.2 nni diff --git a/deployment/docker/README_zh_CN.md b/deployment/docker/README_zh_CN.md index a2b243e472..9766f92138 100644 --- a/deployment/docker/README_zh_CN.md +++ b/deployment/docker/README_zh_CN.md @@ -47,4 +47,4 @@ 使用下列命令从 docker Hub 中拉取 NNI docker 映像。 - docker pull msranni/nni:latest \ No newline at end of file + docker pull msranni/nni:latest diff --git 
a/deployment/pypi/setup.py b/deployment/pypi/setup.py index 61f7ff0178..8968686cd4 100644 --- a/deployment/pypi/setup.py +++ b/deployment/pypi/setup.py @@ -63,7 +63,7 @@ 'scipy', 'coverage', 'colorama', - 'scikit-learn>=0.20,<0.22', + 'scikit-learn>=0.23.2', 'pkginfo', 'websockets' ], diff --git a/docs/en_US/CommunitySharings/ModelCompressionComparison.md b/docs/en_US/CommunitySharings/ModelCompressionComparison.md new file mode 100644 index 0000000000..ba273f9581 --- /dev/null +++ b/docs/en_US/CommunitySharings/ModelCompressionComparison.md @@ -0,0 +1,89 @@ +# Comparison of Filter Pruning Algorithms + +To provide an initial insight into the performance of various filter pruning algorithms, +we conduct extensive experiments with various pruning algorithms on some benchmark models and datasets. +We present the experiment results in this document. +In addition, we provide friendly instructions on the re-implementation of these experiments to facilitate further contributions to this effort. + +## Experiment Setting + +The experiments are performed with the following pruners/datasets/models: + +* Models: [VGG16, ResNet18, ResNet50](https://github.com/microsoft/nni/tree/master/examples/model_compress/models/cifar10) + +* Datasets: CIFAR-10 + +* Pruners: + - These pruners are included: + - Pruners with scheduling : `SimulatedAnnealing Pruner`, `NetAdapt Pruner`, `AutoCompress Pruner`. + Given the overall sparsity requirement, these pruners can automatically generate a sparsity distribution among different layers. + - One-shot pruners: `L1Filter Pruner`, `L2Filter Pruner`, `FPGM Pruner`. + The sparsity of each layer is set the same as the overall sparsity in this experiment. + - Only **filter pruning** performances are compared here. + + For the pruners with scheduling, `L1Filter Pruner` is used as the base algorithm. That is to say, after the sparsity distribution is decided by the scheduling algorithm, `L1Filter Pruner` is used to perform real pruning. 
+ + - All the pruners listed above are implemented in [nni](https://github.com/microsoft/nni/tree/master/docs/en_US/Compressor/Overview.md). + +## Experiment Result + +For each dataset/model/pruner combination, we prune the model to different levels by setting a series of target sparsities for the pruner. + +Here we plot both **Number of Weights - Performances** curve and **FLOPs - Performance** curve. +As a reference, we also plot the result declared in the paper [AutoCompress: An Automatic DNN Structured Pruning Framework for Ultra-High Compression Rates](http://arxiv.org/abs/1907.03141) for models VGG16 and ResNet18 on CIFAR-10. + +The experiment results are shown in the following figures: + +CIFAR-10, VGG16: + +![](../../../examples/model_compress/comparison_of_pruners/img/performance_comparison_vgg16.png) + +CIFAR-10, ResNet18: + +![](../../../examples/model_compress/comparison_of_pruners/img/performance_comparison_resnet18.png) + +CIFAR-10, ResNet50: + +![](../../../examples/model_compress/comparison_of_pruners/img/performance_comparison_resnet50.png) + +## Analysis + +From the experiment results, we draw the following conclusions: + +* Given the constraint on the number of parameters, the pruners with scheduling ( `AutoCompress Pruner` , `SimulatedAnnealing Pruner` ) perform better than the others when the constraint is strict. However, they have no such advantage in FLOPs/Performances comparison since only the number of parameters constraint is considered in the optimization process; +* The basic algorithms `L1Filter Pruner` , `L2Filter Pruner` , `FPGM Pruner` perform very similarly in these experiments; +* `NetAdapt Pruner` cannot achieve a very high compression rate. This is caused by its mechanism that it prunes only one layer each pruning iteration. This leads to unacceptable complexity if the sparsity per iteration is much lower than the overall sparsity constraint. 
+ +## Experiments Reproduction + +### Implementation Details + +* The experiment results are all collected with the default configuration of the pruners in nni, which means that when we call a pruner class in nni, we don't change any default class arguments. + +* Both FLOPs and the number of parameters are counted with [Model FLOPs/Parameters Counter](https://github.com/microsoft/nni/blob/master/docs/en_US/Compressor/CompressionUtils.md#model-flopsparameters-counter) after [model speed up](https://github.com/microsoft/nni/blob/master/docs/en_US/Compressor/ModelSpeedup.md). This avoids potential issues of counting them of masked models. + +* The experiment code can be found [here]( https://github.com/microsoft/nni/tree/master/examples/model_compress/auto_pruners_torch.py). + +### Experiment Result Rendering + +* If you follow the practice in the [example]( https://github.com/microsoft/nni/tree/master/examples/model_compress/auto_pruners_torch.py), for every single pruning experiment, the experiment result will be saved in JSON format as follows: + ``` json + { + "performance": {"original": 0.9298, "pruned": 0.1, "speedup": 0.1, "finetuned": 0.7746}, + "params": {"original": 14987722.0, "speedup": 167089.0}, + "flops": {"original": 314018314.0, "speedup": 38589922.0} + } + ``` + +* The experiment results are saved [here](https://github.com/microsoft/nni/tree/master/examples/model_compress/experiment_data). +You can refer to [analyze](https://github.com/microsoft/nni/tree/master/examples/model_compress/experiment_data/analyze.py) to plot new performance comparison figures. + +## Contribution + +### TODO Items + +* Pruners constrained by FLOPS/latency +* More pruning algorithms/datasets/models + +### Issues +For algorithm implementation & experiment issues, please [create an issue](https://github.com/microsoft/nni/issues/new/). 
diff --git a/docs/en_US/CommunitySharings/perf_compare.rst b/docs/en_US/CommunitySharings/perf_compare.rst index b87fd167c8..2b80ccdc6c 100644 --- a/docs/en_US/CommunitySharings/perf_compare.rst +++ b/docs/en_US/CommunitySharings/perf_compare.rst @@ -8,4 +8,5 @@ Performance comparison and analysis can help users decide a proper algorithm (e. :maxdepth: 1 Neural Architecture Search Comparison - Hyper-parameter Tuning Algorithm Comparsion \ No newline at end of file + Hyper-parameter Tuning Algorithm Comparsion + Model Compression Algorithm Comparsion \ No newline at end of file diff --git a/docs/en_US/Compressor/Overview.md b/docs/en_US/Compressor/Overview.md index 2d68496545..73298ee8af 100644 --- a/docs/en_US/Compressor/Overview.md +++ b/docs/en_US/Compressor/Overview.md @@ -42,6 +42,7 @@ Pruning algorithms compress the original network by removing redundant weights o | [SimulatedAnnealing Pruner](https://nni.readthedocs.io/en/latest/Compressor/Pruner.html#simulatedannealing-pruner) | Automatic pruning with a guided heuristic search method, Simulated Annealing algorithm [Reference Paper](https://arxiv.org/abs/1907.03141) | | [AutoCompress Pruner](https://nni.readthedocs.io/en/latest/Compressor/Pruner.html#autocompress-pruner) | Automatic pruning by iteratively call SimulatedAnnealing Pruner and ADMM Pruner [Reference Paper](https://arxiv.org/abs/1907.03141) | +You can refer to this [benchmark](https://github.com/microsoft/nni/tree/master/docs/en_US/Benchmark.md) for the performance of these pruners on some benchmark problems. 
### Quantization Algorithms diff --git a/docs/en_US/Compressor/Pruner.md b/docs/en_US/Compressor/Pruner.md index 824d8625b3..0901c2a46d 100644 --- a/docs/en_US/Compressor/Pruner.md +++ b/docs/en_US/Compressor/Pruner.md @@ -20,6 +20,8 @@ We provide several pruning algorithms that support fine-grained weight pruning a * [NetAdapt Pruner](#netadapt-pruner) * [SimulatedAnnealing Pruner](#simulatedannealing-pruner) * [AutoCompress Pruner](#autocompress-pruner) +* [AutoML for Model Compression Pruner](#automl-for-model-compression-pruner) +* [Sensitivity Pruner](#sensitivity-pruner) **Others** * [ADMM Pruner](#admm-pruner) @@ -37,7 +39,7 @@ Tensorflow code ```python from nni.compression.tensorflow import LevelPruner config_list = [{ 'sparsity': 0.8, 'op_types': ['default'] }] -pruner = LevelPruner(model_graph, config_list) +pruner = LevelPruner(model, config_list) pruner.compress() ``` @@ -116,17 +118,6 @@ FPGMPruner prune filters with the smallest geometric median. ### Usage -Tensorflow code -```python -from nni.compression.tensorflow import FPGMPruner -config_list = [{ - 'sparsity': 0.5, - 'op_types': ['Conv2D'] -}] -pruner = FPGMPruner(model, config_list) -pruner.compress() -``` - PyTorch code ```python from nni.compression.torch import FPGMPruner @@ -145,11 +136,6 @@ pruner.compress() .. autoclass:: nni.compression.torch.FPGMPruner ``` -##### Tensorflow -```eval_rst -.. autoclass:: nni.compression.tensorflow.FPGMPruner -``` - ## L1Filter Pruner This is an one-shot pruner, In ['PRUNING FILTERS FOR EFFICIENT CONVNETS'](https://arxiv.org/abs/1608.08710), authors Hao Li, Asim Kadav, Igor Durdanovic, Hanan Samet and Hans Peter Graf. @@ -382,12 +368,6 @@ You can view [example](https://github.com/microsoft/nni/blob/master/examples/mod .. autoclass:: nni.compression.torch.AGPPruner ``` -##### Tensorflow - -```eval_rst -.. 
autoclass:: nni.compression.tensorflow.AGPPruner -``` - *** ## NetAdapt Pruner @@ -497,6 +477,39 @@ You can view [example](https://github.com/microsoft/nni/blob/master/examples/mod .. autoclass:: nni.compression.torch.AutoCompressPruner ``` +## AutoML for Model Compression Pruner + +AutoML for Model Compression Pruner (AMCPruner) leverages reinforcement learning to provide the model compression policy. +This learning-based compression policy outperforms conventional rule-based compression policy by having a higher compression ratio, +better preserving the accuracy and freeing human labor. + +![](../../img/amc_pruner.jpg) + +For more details, please refer to [AMC: AutoML for Model Compression and Acceleration on Mobile Devices](https://arxiv.org/pdf/1802.03494.pdf). + + +#### Usage + +PyTorch code + +```python +from nni.compression.torch import AMCPruner +config_list = [{ + 'op_types': ['Conv2d', 'Linear'] + }] +pruner = AMCPruner(model, config_list, evaluator, val_loader, flops_ratio=0.5) +pruner.compress() +``` + +You can view [example](https://github.com/microsoft/nni/blob/master/examples/model_compress/amc/) for more information. + +#### User configuration for AMC Pruner + +##### PyTorch + +```eval_rst +.. autoclass:: nni.compression.torch.AMCPruner +``` ## ADMM Pruner Alternating Direction Method of Multipliers (ADMM) is a mathematical optimization technique, @@ -588,3 +601,35 @@ We try to reproduce the experiment result of the fully connected network on MNIS ![](../../img/lottery_ticket_mnist_fc.png) The above figure shows the result of the fully connected network. `round0-sparsity-0.0` is the performance without pruning. Consistent with the paper, pruning around 80% also obtain similar performance compared to non-pruning, and converges a little faster. If pruning too much, e.g., larger than 94%, the accuracy becomes lower and convergence becomes a little slower. 
A little different from the paper, the trend of the data in the paper is relatively more clear. + + +## Sensitivity Pruner +For each round, SensitivityPruner prunes the model based on the sensitivity to the accuracy of each layer until meeting the final configured sparsity of the whole model: + 1. Analyze the sensitivity of each layer in the current state of the model. + 2. Prune each layer according to the sensitivity. + +For more details, please refer to [Learning both Weights and Connections for Efficient Neural Networks ](https://arxiv.org/abs/1506.02626). + +#### Usage + +PyTorch code + +```python +from nni.compression.torch import SensitivityPruner +config_list = [{ + 'sparsity': 0.5, + 'op_types': ['Conv2d'] + }] +pruner = SensitivityPruner(model, config_list, finetuner=fine_tuner, evaluator=evaluator) +# eval_args and finetune_args are the parameters passed to the evaluator and finetuner respectively +pruner.compress(eval_args=[model], finetune_args=[model]) +``` + + +#### User configuration for Sensitivity Pruner + +##### PyTorch + +```eval_rst +.. autoclass:: nni.compression.torch.SensitivityPruner +``` diff --git a/docs/en_US/TrainingService/AMLMode.md b/docs/en_US/TrainingService/AMLMode.md index 0907f29a02..b365375b1b 100644 --- a/docs/en_US/TrainingService/AMLMode.md +++ b/docs/en_US/TrainingService/AMLMode.md @@ -49,30 +49,34 @@ tuner: trial: command: python3 mnist.py codeDir: . - computeTarget: ${replace_to_your_computeTarget} image: msranni/nni + gpuNum: 1 amlConfig: subscriptionId: ${replace_to_your_subscriptionId} resourceGroup: ${replace_to_your_resourceGroup} workspaceName: ${replace_to_your_workspaceName} - + computeTarget: ${replace_to_your_computeTarget} ``` Note: You should set `trainingServicePlatform: aml` in NNI config YAML file if you want to start experiment in aml mode. Compared with [LocalMode](LocalMode.md) trial configuration in aml mode have these additional keys: -* computeTarget - * required key. 
The compute cluster name you want to use in your AML workspace. See Step 6. * image * required key. The docker image name used in job. The image `msranni/nni` of this example only support GPU computeTargets. amlConfig: * subscriptionId - * the subscriptionId of your account + * required key, the subscriptionId of your account * resourceGroup - * the resourceGroup of your account + * required key, the resourceGroup of your account * workspaceName - * the workspaceName of your account + * required key, the workspaceName of your account +* computeTarget + * required key, the compute cluster name you want to use in your AML workspace. See Step 6. +* maxTrialNumPerGpu + * optional key, used to specify the max concurrency trial number on a GPU device. +* useActiveGpu + * optional key, used to specify whether to use a GPU if there is another process. By default, NNI will use the GPU only if there is no other active process in the GPU. The required information of amlConfig could be found in the downloaded `config.json` in Step 5. diff --git a/docs/en_US/Tutorial/Nnictl.md b/docs/en_US/Tutorial/Nnictl.md index ed5c9761e1..adcc4b91e3 100644 --- a/docs/en_US/Tutorial/Nnictl.md +++ b/docs/en_US/Tutorial/Nnictl.md @@ -262,7 +262,7 @@ Debug mode will disable version check function in Trialkeeper. |Name, shorthand|Required|Default|Description| |------|------|------ |------| |id| False| |ID of the experiment you want to set| - |--value, -v| True| |the experiment duration will be NUMBER seconds. SUFFIX may be 's' for seconds (the default), 'm' for minutes, 'h' for hours or 'd' for days.| + |--value, -v| True| | Strings like '1m' for one minute or '2h' for two hours. SUFFIX may be 's' for seconds, 'm' for minutes, 'h' for hours or 'd' for days.| * Example @@ -305,12 +305,14 @@ Debug mode will disable version check function in Trialkeeper. * Description - You can use this command to show trial's information. + You can use this command to show trial's information. 
Note that if `head` or `tail` is set, only complete trials will be listed. * Usage ```bash nnictl trial ls + nnictl trial ls --head 10 + nnictl trial ls --tail 10 ``` * Options @@ -318,6 +320,8 @@ Debug mode will disable version check function in Trialkeeper. |Name, shorthand|Required|Default|Description| |------|------|------ |------| |id| False| |ID of the experiment you want to set| + |--head|False||the number of items to be listed with the highest default metric| + |--tail|False||the number of items to be listed with the lowest default metric| * __nnictl trial kill__ @@ -444,9 +448,6 @@ Debug mode will disable version check function in Trialkeeper. |--all| False| |delete all of experiments| - - - * __nnictl experiment export__ * Description @@ -465,13 +466,14 @@ Debug mode will disable version check function in Trialkeeper. |id| False| |ID of the experiment | |--filename, -f| True| |File path of the output file | |--type| True| |Type of output file, only support "csv" and "json"| + |--intermediate, -i|False||Are intermediate results included| * Examples > export all trial data in an experiment as json format ```bash - nnictl experiment export [experiment_id] --filename [file_path] --type json + nnictl experiment export [experiment_id] --filename [file_path] --type json --intermediate ``` * __nnictl experiment import__ @@ -531,6 +533,62 @@ Debug mode will disable version check function in Trialkeeper. nnictl experiment import [experiment_id] -f experiment_data.json ``` +* __nnictl experiment save__ + * Description + + Save nni experiment metadata and code data. 
+ + * Usage + + ```bash + nnictl experiment save [OPTIONS] + ``` + + * Options + + |Name, shorthand|Required|Default|Description| + |------|------|------ |------| + |id| True| |The id of the experiment you want to save| + |--path, -p| False| |the folder path to store nni experiment data, default current working directory| + |--saveCodeDir, -s| False| |save codeDir data of the experiment, default False| + + * Examples + + > save an experiment + + ```bash + nnictl experiment save [experiment_id] --saveCodeDir + ``` + +* __nnictl experiment load__ + * Description + + Load an nni experiment. + + * Usage + + ```bash + nnictl experiment load [OPTIONS] + ``` + + * Options + + |Name, shorthand|Required|Default|Description| + |------|------|------ |------| + |--path, -p| True| |the file path of nni package| + |--codeDir, -c| True| |the path of codeDir for loaded experiment, this path will also put the code in the loaded experiment package| + |--logDir, -l| False| |the path of logDir for loaded experiment| + + * Examples + + > load an experiment + + ```bash + nnictl experiment load --path [path] --codeDir [codeDir] + ``` + + + ### Manage platform information @@ -850,4 +908,3 @@ Debug mode will disable version check function in Trialkeeper. ```bash nnictl --version ``` - diff --git a/docs/en_US/conf.py b/docs/en_US/conf.py index 6962037bfb..2d6ce2fb75 100644 --- a/docs/en_US/conf.py +++ b/docs/en_US/conf.py @@ -17,6 +17,7 @@ import os import sys sys.path.insert(0, os.path.abspath('../../src/sdk/pynni')) +sys.path.insert(1, os.path.abspath('../../src/sdk/pycli')) # -- Project information --------------------------------------------------- diff --git a/docs/en_US/nnicli_ref.md b/docs/en_US/nnicli_ref.md new file mode 100644 index 0000000000..02c8cbbb30 --- /dev/null +++ b/docs/en_US/nnicli_ref.md @@ -0,0 +1,41 @@ +# NNI Client + +NNI client is a python API of `nnictl`, which implements the most commonly used commands. 
Users can use this API to control their experiments, collect experiment results and conduct advanced analyses based on experiment results in python code directly instead of using command line. Here is an example: + +``` +from nnicli import Experiment + +# create an experiment instance +exp = Experiment() + +# start an experiment, then connect the instance to this experiment +# you can also use `resume_experiment`, `view_experiment` or `connect_experiment` +# only one of them should be called in one instance +exp.start_experiment('nni/examples/trials/mnist-pytorch/config.yml', port=9090) + +# update the experiment's concurrency +exp.update_concurrency(3) + +# get some information about the experiment +print(exp.get_experiment_status()) +print(exp.get_job_statistics()) +print(exp.list_trial_jobs()) + +# stop the experiment, then disconnect the instance from the experiment. +exp.stop_experiment() +``` + +## References + +```eval_rst +.. autoclass:: nnicli.Experiment + :members: +.. autoclass:: nnicli.TrialJob + :members: +.. autoclass:: nnicli.TrialHyperParameters + :members: +.. autoclass:: nnicli.TrialMetricData + :members: +.. 
autoclass:: nnicli.TrialResult + :members: +``` diff --git a/docs/en_US/sdk_reference.rst b/docs/en_US/sdk_reference.rst index 2602e257b9..ca87bf7500 100644 --- a/docs/en_US/sdk_reference.rst +++ b/docs/en_US/sdk_reference.rst @@ -8,4 +8,5 @@ Python API Reference Auto Tune NAS - Compression Utilities \ No newline at end of file + Compression Utilities + NNI Client \ No newline at end of file diff --git a/docs/img/amc_pruner.jpg b/docs/img/amc_pruner.jpg new file mode 100644 index 0000000000..456dcbc318 Binary files /dev/null and b/docs/img/amc_pruner.jpg differ diff --git a/docs/requirements.txt b/docs/requirements.txt index 721a587a20..59706a60ea 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -13,6 +13,6 @@ peewee nbsphinx schema tensorboard -scikit-learn==0.20 +scikit-learn>=0.23.2 thop https://download.pytorch.org/whl/cpu/torch-1.3.1%2Bcpu-cp37-cp37m-linux_x86_64.whl diff --git a/docs/zh_CN/Compressor/Pruner.md b/docs/zh_CN/Compressor/Pruner.md index f78b8e0f1c..d11829e4b5 100644 --- a/docs/zh_CN/Compressor/Pruner.md +++ b/docs/zh_CN/Compressor/Pruner.md @@ -37,7 +37,7 @@ TensorFlow 代码 ```python from nni.compression.tensorflow import LevelPruner config_list = [{ 'sparsity': 0.8, 'op_types': ['default'] }] -pruner = LevelPruner(model_graph, config_list) +pruner = LevelPruner(model, config_list) pruner.compress() ``` @@ -102,16 +102,6 @@ pruner.compress() ### 用法 -TensorFlow 代码 -```python -from nni.compression.tensorflow import FPGMPruner -config_list = [{ - 'sparsity': 0.5, - 'op_types': ['Conv2D'] -}] -pruner = FPGMPruner(model, config_list) -pruner.compress() -``` PyTorch 代码 ```python from nni.compression.torch import FPGMPruner diff --git a/examples/model_compress/amc/amc_search.py b/examples/model_compress/amc/amc_search.py new file mode 100644 index 0000000000..b08060efb9 --- /dev/null +++ b/examples/model_compress/amc/amc_search.py @@ -0,0 +1,136 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import sys +import argparse +import time + +import torch +import torch.nn as nn + +from nni.compression.torch import AMCPruner +from data import get_split_dataset +from utils import AverageMeter, accuracy + +sys.path.append('../models') + +def parse_args(): + parser = argparse.ArgumentParser(description='AMC search script') + parser.add_argument('--model_type', default='mobilenet', type=str, choices=['mobilenet', 'mobilenetv2'], help='model to prune') + parser.add_argument('--dataset', default='cifar10', type=str, choices=['cifar10', 'imagenet'], help='dataset to use (cifar/imagenet)') + parser.add_argument('--batch_size', default=50, type=int, help='number of data batch size') + parser.add_argument('--data_root', default='./cifar10', type=str, help='dataset path') + parser.add_argument('--flops_ratio', default=0.5, type=float, help='target flops ratio to preserve of the model') + parser.add_argument('--lbound', default=0.2, type=float, help='minimum sparsity') + parser.add_argument('--rbound', default=1., type=float, help='maximum sparsity') + parser.add_argument('--ckpt_path', default=None, type=str, help='manual path of checkpoint') + + parser.add_argument('--train_episode', default=800, type=int, help='number of training episode') + parser.add_argument('--n_gpu', default=1, type=int, help='number of gpu to use') + parser.add_argument('--n_worker', default=16, type=int, help='number of data loader worker') + parser.add_argument('--job', default='train_export', type=str, choices=['train_export', 'export_only'], + help='search best pruning policy and export or just export model with searched policy') + parser.add_argument('--export_path', default=None, type=str, help='path for exporting models') + parser.add_argument('--searched_model_path', default=None, type=str, help='path for searched best wrapped model') + + return parser.parse_args() + + +def get_model_and_checkpoint(model, dataset, checkpoint_path, n_gpu=1): + if model == 'mobilenet' and dataset == 
'imagenet': + from mobilenet import MobileNet + net = MobileNet(n_class=1000) + elif model == 'mobilenetv2' and dataset == 'imagenet': + from mobilenet_v2 import MobileNetV2 + net = MobileNetV2(n_class=1000) + elif model == 'mobilenet' and dataset == 'cifar10': + from mobilenet import MobileNet + net = MobileNet(n_class=10) + elif model == 'mobilenetv2' and dataset == 'cifar10': + from mobilenet_v2 import MobileNetV2 + net = MobileNetV2(n_class=10) + else: + raise NotImplementedError + if checkpoint_path: + print('loading {}...'.format(checkpoint_path)) + sd = torch.load(checkpoint_path, map_location=torch.device('cpu')) + if 'state_dict' in sd: # a checkpoint but not a state_dict + sd = sd['state_dict'] + sd = {k.replace('module.', ''): v for k, v in sd.items()} + net.load_state_dict(sd) + + if torch.cuda.is_available() and n_gpu > 0: + net = net.cuda() + if n_gpu > 1: + net = torch.nn.DataParallel(net, range(n_gpu)) + + return net + +def init_data(args): + # split the train set into train + val + # for CIFAR, split 5k for val + # for ImageNet, split 3k for val + val_size = 5000 if 'cifar' in args.dataset else 3000 + train_loader, val_loader, _ = get_split_dataset( + args.dataset, args.batch_size, + args.n_worker, val_size, + data_root=args.data_root, + shuffle=False + ) # same sampling + return train_loader, val_loader + +def validate(val_loader, model, verbose=False): + batch_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + top5 = AverageMeter() + + criterion = nn.CrossEntropyLoss().cuda() + # switch to evaluate mode + model.eval() + end = time.time() + + t1 = time.time() + with torch.no_grad(): + for i, (input, target) in enumerate(val_loader): + target = target.to(device) + input_var = torch.autograd.Variable(input).to(device) + target_var = torch.autograd.Variable(target).to(device) + + # compute output + output = model(input_var) + loss = criterion(output, target_var) + + # measure accuracy and record loss + prec1, prec5 = 
accuracy(output.data, target, topk=(1, 5)) + losses.update(loss.item(), input.size(0)) + top1.update(prec1.item(), input.size(0)) + top5.update(prec5.item(), input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + t2 = time.time() + if verbose: + print('* Test loss: %.3f top1: %.3f top5: %.3f time: %.3f' % + (losses.avg, top1.avg, top5.avg, t2 - t1)) + return top5.avg + + +if __name__ == "__main__": + args = parse_args() + + device = torch.device('cuda') if torch.cuda.is_available() and args.n_gpu > 0 else torch.device('cpu') + + model = get_model_and_checkpoint(args.model_type, args.dataset, checkpoint_path=args.ckpt_path, n_gpu=args.n_gpu) + _, val_loader = init_data(args) + + config_list = [{ + 'op_types': ['Conv2d', 'Linear'] + }] + pruner = AMCPruner( + model, config_list, validate, val_loader, model_type=args.model_type, dataset=args.dataset, + train_episode=args.train_episode, job=args.job, export_path=args.export_path, + searched_model_path=args.searched_model_path, + flops_ratio=args.flops_ratio, lbound=args.lbound, rbound=args.rbound) + pruner.compress() diff --git a/examples/model_compress/amc/amc_train.py b/examples/model_compress/amc/amc_train.py new file mode 100644 index 0000000000..bedebc044c --- /dev/null +++ b/examples/model_compress/amc/amc_train.py @@ -0,0 +1,234 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
"""Train, fine-tune or evaluate a MobileNet / MobileNetV2 model for the AMC example.

``train`` and ``test`` operate on the module-level ``net``, ``criterion``,
``optimizer``, ``writer``, ``args``, ``log_dir`` and ``best_acc`` objects that
are created in the ``__main__`` section at the bottom of this file.
"""

import sys
import os
import time
import argparse
import shutil
import math
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from tensorboardX import SummaryWriter

from nni.compression.torch.pruning.amc.lib.net_measure import measure_model
from nni.compression.torch.pruning.amc.lib.utils import get_output_folder

from data import get_dataset
from utils import AverageMeter, accuracy, progress_bar

sys.path.append('../models')
from mobilenet import MobileNet
from mobilenet_v2 import MobileNetV2


def parse_args():
    """Parse the command-line options of the train / fine-tune script."""
    parser = argparse.ArgumentParser(description='AMC train / fine-tune script')
    parser.add_argument('--model_type', default='mobilenet', type=str, help='name of the model to train')
    parser.add_argument('--dataset', default='cifar10', type=str, help='name of the dataset to train')
    parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
    parser.add_argument('--n_gpu', default=1, type=int, help='number of GPUs to use')
    parser.add_argument('--batch_size', default=128, type=int, help='batch size')
    parser.add_argument('--n_worker', default=4, type=int, help='number of data loader worker')
    # Only the schedules implemented by adjust_learning_rate() are advertised.
    parser.add_argument('--lr_type', default='exp', type=str, help='lr scheduler (exp/cos/fixed)')
    parser.add_argument('--n_epoch', default=50, type=int, help='number of epochs to train')
    parser.add_argument('--wd', default=4e-5, type=float, help='weight decay')
    parser.add_argument('--seed', default=None, type=int, help='random seed to set')
    parser.add_argument('--data_root', default='./data', type=str, help='dataset path')
    # resume
    parser.add_argument('--ckpt_path', default=None, type=str, help='checkpoint path to fine tune')
    # run eval
    parser.add_argument('--eval', action='store_true', help='Simply run eval')
    parser.add_argument('--calc_flops', action='store_true', help='Calculate flops')

    return parser.parse_args()


def get_model(args):
    """Build the network selected by ``args`` and optionally load a checkpoint.

    Reads ``args.dataset``, ``args.model_type``, ``args.ckpt_path``,
    ``args.device`` and ``args.n_gpu``.

    Returns:
        The model moved to ``args.device``; wrapped in ``DataParallel`` when
        CUDA is available and more than one GPU is requested.

    Raises:
        NotImplementedError: for an unknown dataset or model type.
    """
    print('=> Building model..')

    if args.dataset == 'imagenet':
        n_class = 1000
    elif args.dataset == 'cifar10':
        n_class = 10
    else:
        raise NotImplementedError('unsupported dataset: {}'.format(args.dataset))

    # The model is built without an unconditional .cuda(): placement is decided
    # once by net.to(args.device) below, so CPU-only hosts work as well.
    if args.model_type == 'mobilenet':
        net = MobileNet(n_class=n_class)
    elif args.model_type == 'mobilenetv2':
        net = MobileNetV2(n_class=n_class)
    else:
        raise NotImplementedError('unsupported model type: {}'.format(args.model_type))

    if args.ckpt_path is not None:
        # the checkpoint can be a saved whole model object exported by amc_search.py, or a state_dict
        print('=> Loading checkpoint {} ..'.format(args.ckpt_path))
        # map_location lets a checkpoint saved on GPU be opened on the selected device.
        ckpt = torch.load(args.ckpt_path, map_location=args.device)
        if isinstance(ckpt, dict):
            # Either the wrapper written by save_checkpoint() ({'state_dict': ...})
            # or a bare state_dict (an OrderedDict, which is also a dict).
            net.load_state_dict(ckpt.get('state_dict', ckpt))
        else:
            net = ckpt

    net.to(args.device)
    if torch.cuda.is_available() and args.n_gpu > 1:
        net = torch.nn.DataParallel(net, list(range(args.n_gpu)))
    return net


def train(epoch, train_loader, device):
    """Run one training epoch over ``train_loader`` and log the epoch averages.

    Uses the module-level ``net``, ``optimizer``, ``criterion`` and ``writer``.
    """
    print('\nEpoch: %d' % epoch)
    net.train()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()

    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        # measure accuracy and record loss
        prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(prec1.item(), inputs.size(0))
        top5.update(prec5.item(), inputs.size(0))
        # timing
        batch_time.update(time.time() - end)
        end = time.time()

        progress_bar(batch_idx, len(train_loader), 'Loss: {:.3f} | Acc1: {:.3f}% | Acc5: {:.3f}%'
                     .format(losses.avg, top1.avg, top5.avg))
    writer.add_scalar('loss/train', losses.avg, epoch)
    writer.add_scalar('acc/train_top1', top1.avg, epoch)
    writer.add_scalar('acc/train_top5', top5.avg, epoch)


def test(epoch, test_loader, device, save=True):
    """Evaluate the module-level ``net`` on ``test_loader``.

    When ``save`` is true the epoch averages are written to ``writer``, the
    module-level ``best_acc`` is updated with the top-1 average, and a
    checkpoint is stored in ``log_dir``.
    """
    global best_acc
    net.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()

    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(test_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
            losses.update(loss.item(), inputs.size(0))
            top1.update(prec1.item(), inputs.size(0))
            top5.update(prec5.item(), inputs.size(0))
            # timing
            batch_time.update(time.time() - end)
            end = time.time()

            progress_bar(batch_idx, len(test_loader), 'Loss: {:.3f} | Acc1: {:.3f}% | Acc5: {:.3f}%'
                         .format(losses.avg, top1.avg, top5.avg))

    if save:
        writer.add_scalar('loss/test', losses.avg, epoch)
        writer.add_scalar('acc/test_top1', top1.avg, epoch)
        writer.add_scalar('acc/test_top5', top5.avg, epoch)

        is_best = False
        if top1.avg > best_acc:
            best_acc = top1.avg
            is_best = True

        print('Current best acc: {}'.format(best_acc))
        save_checkpoint({
            'epoch': epoch,
            'model': args.model_type,
            'dataset': args.dataset,
            # Unwrap DataParallel so the saved keys carry no 'module.' prefix.
            'state_dict': net.module.state_dict() if isinstance(net, nn.DataParallel) else net.state_dict(),
            'acc': top1.avg,
            'optimizer': optimizer.state_dict(),
        }, is_best, checkpoint_dir=log_dir)


def adjust_learning_rate(optimizer, epoch):
    """Set the learning rate of every parameter group for ``epoch`` and return it.

    The schedule is selected by the module-level ``args.lr_type``:
    ``cos`` (cosine without warm-up), ``exp`` (decay 0.96 per epoch) or ``fixed``.

    Raises:
        NotImplementedError: for any other ``args.lr_type``.
    """
    if args.lr_type == 'cos':  # cos without warm-up
        lr = 0.5 * args.lr * (1 + math.cos(math.pi * epoch / args.n_epoch))
    elif args.lr_type == 'exp':
        step = 1
        decay = 0.96
        lr = args.lr * (decay ** (epoch // step))
    elif args.lr_type == 'fixed':
        lr = args.lr
    else:
        raise NotImplementedError('unsupported lr_type: {}'.format(args.lr_type))
    print('=> lr: {}'.format(lr))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr


def save_checkpoint(state, is_best, checkpoint_dir='.'):
    """Save ``state`` to ``ckpt.pth.tar`` in ``checkpoint_dir``.

    When ``is_best`` is true the file is also copied to ``ckpt.best.pth.tar``.
    """
    filename = os.path.join(checkpoint_dir, 'ckpt.pth.tar')
    print('=> Saving checkpoint to {}'.format(filename))
    torch.save(state, filename)
    if is_best:
        # Build the target explicitly: a str.replace over the whole path would
        # also rewrite a directory component that happens to contain '.pth.tar'.
        shutil.copyfile(filename, os.path.join(checkpoint_dir, 'ckpt.best.pth.tar'))


if __name__ == '__main__':
    args = parse_args()

    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
    args.device = torch.device('cuda') if torch.cuda.is_available() and args.n_gpu > 0 else torch.device('cpu')

    best_acc = 0  # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    if args.seed is not None:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    print('=> Preparing data..')
    train_loader, val_loader, n_class = get_dataset(args.dataset, args.batch_size, args.n_worker,
                                                    data_root=args.data_root)

    net = get_model(args)  # for measure

    if args.calc_flops:
        IMAGE_SIZE = 224 if args.dataset == 'imagenet' else 32
        n_flops, n_params = measure_model(net, IMAGE_SIZE, IMAGE_SIZE)
        print('=> Model Parameter: {:.3f} M, FLOPs: {:.3f}M'.format(n_params / 1e6, n_flops / 1e6))
        # sys.exit instead of the interactive-only exit() helper installed by `site`.
        sys.exit(0)

    criterion = nn.CrossEntropyLoss()
    print('Using SGD...')
    print('weight decay = {}'.format(args.wd))
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.wd)

    if args.eval:  # just run eval
        print('=> Start evaluation...')
        test(0, val_loader, args.device, save=False)
    else:  # train
        print('=> Start training...')
        print('Training {} on {}...'.format(args.model_type, args.dataset))
        train_type = 'train' if args.ckpt_path is None else 'finetune'
        log_dir = get_output_folder('./logs', '{}_{}_{}'.format(args.model_type, args.dataset, train_type))
        print('=> Saving logs to {}'.format(log_dir))
        # tf writer
        writer = SummaryWriter(logdir=log_dir)

        for epoch in range(start_epoch, start_epoch + args.n_epoch):
            adjust_learning_rate(optimizer, epoch)
            train(epoch, train_loader, args.device)
            test(epoch, val_loader, args.device)

        writer.close()
        print('=> Best top-1 acc: {}%'.format(best_acc))
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Data loaders for the AMC example (CIFAR-10 and ImageNet)."""

import torch
import torch.nn.parallel
import torch.optim
import torch.utils.data
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np

import os


class _SubsetSequentialSampler(SubsetRandomSampler):
    """Yield the given indices in their stored order instead of a random one."""

    def __iter__(self):
        return iter(self.indices)


def _cifar_transforms():
    """Return the (train, test) transform pipelines used for CIFAR-10."""
    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    train_tf = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    test_tf = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])
    return train_tf, test_tf


def _imagenet_transforms(input_size=224, **crop_kwargs):
    """Return the (train, test) transform pipelines used for ImageNet.

    ``crop_kwargs`` are forwarded to ``RandomResizedCrop`` of the train pipeline.
    """
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    train_tf = transforms.Compose([
        transforms.RandomResizedCrop(input_size, **crop_kwargs),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    test_tf = transforms.Compose([
        transforms.Resize(int(input_size / 0.875)),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        normalize,
    ])
    return train_tf, test_tf


def _split_loaders(trainset, valset, indices, val_size, index_sampler, batch_size, n_worker):
    """Build train / val loaders over disjoint slices of ``indices``.

    The first ``val_size`` indices feed the validation loader (reading from
    ``valset``); the remaining ones feed the training loader (reading from
    ``trainset``).
    """
    assert val_size < len(indices), 'val_size must be smaller than the training set'
    train_idx, val_idx = indices[val_size:], indices[:val_size]

    train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=False,
                                               sampler=index_sampler(train_idx),
                                               num_workers=n_worker, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False,
                                             sampler=index_sampler(val_idx),
                                             num_workers=n_worker, pin_memory=True)
    return train_loader, val_loader


def get_dataset(dset_name, batch_size, n_worker, data_root='../../data'):
    """Create the regular train / validation loaders for ``dset_name``.

    Args:
        dset_name: ``'cifar10'`` or ``'imagenet'``.
        batch_size: batch size used by both loaders.
        n_worker: number of data-loader worker processes.
        data_root: CIFAR-10 download root, or the directory that holds the
            ImageNet ``train`` and ``val`` folders.

    Returns:
        ``(train_loader, val_loader, n_class)``.

    Raises:
        NotImplementedError: for any other dataset name.
    """
    print('=> Preparing data..')
    if dset_name == 'cifar10':
        transform_train, transform_test = _cifar_transforms()
        trainset = torchvision.datasets.CIFAR10(root=data_root, train=True, download=True, transform=transform_train)
        train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True,
                                                   num_workers=n_worker, pin_memory=True)
        testset = torchvision.datasets.CIFAR10(root=data_root, train=False, download=True, transform=transform_test)
        val_loader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False,
                                                 num_workers=n_worker, pin_memory=True)
        n_class = 10
    elif dset_name == 'imagenet':
        traindir = os.path.join(data_root, 'train')
        valdir = os.path.join(data_root, 'val')
        transform_train, transform_test = _imagenet_transforms(scale=(0.2, 1.0))

        train_loader = torch.utils.data.DataLoader(
            datasets.ImageFolder(traindir, transform_train),
            batch_size=batch_size, shuffle=True,
            num_workers=n_worker, pin_memory=True)
        val_loader = torch.utils.data.DataLoader(
            datasets.ImageFolder(valdir, transform_test),
            batch_size=batch_size, shuffle=False,
            num_workers=n_worker, pin_memory=True)
        n_class = 1000
    else:
        raise NotImplementedError('unsupported dataset: {}'.format(dset_name))

    return train_loader, val_loader, n_class


def get_split_dataset(dset_name, batch_size, n_worker, val_size, data_root='../data', shuffle=True):
    '''
    split the train set into train / val for rl search

    Both loaders read the *training* data: the first ``val_size`` indices are
    served with the test-time transforms as validation set, the rest with the
    training transforms. With ``shuffle=False`` each subset is iterated in a
    fixed order.

    Returns ``(train_loader, val_loader, n_class)``; raises
    ``NotImplementedError`` for an unknown ``dset_name``.
    '''
    # every time we use the same order for the split subset when shuffle is off
    index_sampler = SubsetRandomSampler if shuffle else _SubsetSequentialSampler

    print('=> Preparing data: {}...'.format(dset_name))
    if dset_name == 'cifar10':
        transform_train, transform_test = _cifar_transforms()
        # Both views must come from CIFAR-10: the indices are shared between them
        # and the function reports n_class = 10 (the train view used to be CIFAR100).
        trainset = torchvision.datasets.CIFAR10(root=data_root, train=True, download=True, transform=transform_train)
        valset = torchvision.datasets.CIFAR10(root=data_root, train=True, download=True, transform=transform_test)
        # Indices are used in dataset order here; only the imagenet branch shuffles them.
        indices = list(range(len(trainset)))
        train_loader, val_loader = _split_loaders(trainset, valset, indices, val_size,
                                                  index_sampler, batch_size, n_worker)
        n_class = 10
    elif dset_name == 'imagenet':
        train_dir = os.path.join(data_root, 'train')
        train_transform, test_transform = _imagenet_transforms()

        trainset = datasets.ImageFolder(train_dir, train_transform)
        valset = datasets.ImageFolder(train_dir, test_transform)
        indices = list(range(len(trainset)))
        np.random.shuffle(indices)
        train_loader, val_loader = _split_loaders(trainset, valset, indices, val_size,
                                                  index_sampler, batch_size, n_worker)
        n_class = 1000
    else:
        raise NotImplementedError('unsupported dataset: {}'.format(dset_name))

    return train_loader, val_loader, n_class
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import sys
import os
import shutil
import time


class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the running sum, the count and the average."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, weighted by ``n`` in the average."""
        self.val = val
        self.sum += val * n
        self.count += n
        if self.count > 0:
            self.avg = self.sum / self.count

    def accumulate(self, val, n=1):
        """Add ``val`` unweighted to the running sum and ``n`` to the count.

        Unlike ``update``, ``self.val`` is left unchanged.
        """
        self.sum += val
        self.count += n
        if self.count > 0:
            self.avg = self.sum / self.count


def accuracy(output, target, topk=(1, 5)):
    """Computes the precision@k for the specified values of k

    Args:
        output: score tensor; dimension 1 is indexed by class.
        target: tensor holding one class index per sample.
        topk: iterable of k values.

    Returns:
        A list with one scalar tensor (percentage) per k that does not exceed
        the number of classes, in the order given, followed by one zero scalar
        tensor for every k that does exceed it.
    """
    batch_size = target.size(0)
    num_classes = output.size(1)
    valid_ks = [k for k in topk if k <= num_classes]
    # Placeholders are tensors (not lists) so callers can treat every entry
    # uniformly, e.g. call .item() on it.
    placeholders = [output.new_zeros(()).float() for _ in range(len(topk) - len(valid_ks))]
    if not valid_ks:
        # Nothing can be evaluated; max() below would fail on an empty list.
        return placeholders

    _, pred = output.topk(max(valid_ks), 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.reshape(1, -1).expand_as(pred))

    res = []
    for k in valid_ks:
        # reshape instead of view: `correct` is derived from a transposed tensor
        # and may be non-contiguous, in which case view(-1) raises.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res + placeholders


# Custom progress bar
# shutil.get_terminal_size falls back to 80 columns when no terminal is attached,
# where parsing the output of `stty size` failed at import time.
term_width = shutil.get_terminal_size(fallback=(80, 24)).columns
TOTAL_BAR_LENGTH = 40.
+last_time = time.time() +begin_time = last_time + + +def progress_bar(current, total, msg=None): + def format_time(seconds): + days = int(seconds / 3600 / 24) + seconds = seconds - days * 3600 * 24 + hours = int(seconds / 3600) + seconds = seconds - hours * 3600 + minutes = int(seconds / 60) + seconds = seconds - minutes * 60 + secondsf = int(seconds) + seconds = seconds - secondsf + millis = int(seconds * 1000) + + f = '' + i = 1 + if days > 0: + f += str(days) + 'D' + i += 1 + if hours > 0 and i <= 2: + f += str(hours) + 'h' + i += 1 + if minutes > 0 and i <= 2: + f += str(minutes) + 'm' + i += 1 + if secondsf > 0 and i <= 2: + f += str(secondsf) + 's' + i += 1 + if millis > 0 and i <= 2: + f += str(millis) + 'ms' + i += 1 + if f == '': + f = '0ms' + return f + + global last_time, begin_time + if current == 0: + begin_time = time.time() # Reset for new bar. + + cur_len = int(TOTAL_BAR_LENGTH*current/total) + rest_len = int(TOTAL_BAR_LENGTH - cur_len) - 1 + + sys.stdout.write(' [') + for i in range(cur_len): + sys.stdout.write('=') + sys.stdout.write('>') + for i in range(rest_len): + sys.stdout.write('.') + sys.stdout.write(']') + + cur_time = time.time() + step_time = cur_time - last_time + last_time = cur_time + tot_time = cur_time - begin_time + + L = [] + L.append(' Step: %s' % format_time(step_time)) + L.append(' | Tot: %s' % format_time(tot_time)) + if msg: + L.append(' | ' + msg) + + msg = ''.join(L) + sys.stdout.write(msg) + for i in range(term_width-int(TOTAL_BAR_LENGTH)-len(msg)-3): + sys.stdout.write(' ') + + # Go back to the center of the bar. 
+ for i in range(term_width-int(TOTAL_BAR_LENGTH/2)+2): + sys.stdout.write('\b') + sys.stdout.write(' %d/%d ' % (current+1, total)) + + if current < total-1: + sys.stdout.write('\r') + else: + sys.stdout.write('\n') + sys.stdout.flush() diff --git a/examples/model_compress/auto_pruners_torch.py b/examples/model_compress/auto_pruners_torch.py index 9f0678b6f0..33ecfb8f5f 100644 --- a/examples/model_compress/auto_pruners_torch.py +++ b/examples/model_compress/auto_pruners_torch.py @@ -9,78 +9,81 @@ import json import torch from torch.optim.lr_scheduler import StepLR, MultiStepLR -from torchvision import datasets, transforms, models +from torchvision import datasets, transforms from models.mnist.lenet import LeNet from models.cifar10.vgg import VGG -from nni.compression.torch import L1FilterPruner, SimulatedAnnealingPruner, ADMMPruner, NetAdaptPruner, AutoCompressPruner +from models.cifar10.resnet import ResNet18, ResNet50 +from nni.compression.torch import L1FilterPruner, L2FilterPruner, FPGMPruner +from nni.compression.torch import SimulatedAnnealingPruner, ADMMPruner, NetAdaptPruner, AutoCompressPruner from nni.compression.torch import ModelSpeedup +from nni.compression.torch.utils.counter import count_flops_params -def get_data(args): +def get_data(dataset, data_dir, batch_size, test_batch_size): ''' get data ''' kwargs = {'num_workers': 1, 'pin_memory': True} if torch.cuda.is_available() else { } - if args.dataset == 'mnist': + if dataset == 'mnist': train_loader = torch.utils.data.DataLoader( - datasets.MNIST(args.data_dir, train=True, download=True, + datasets.MNIST(data_dir, train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])), - batch_size=args.batch_size, shuffle=True, **kwargs) + batch_size=batch_size, shuffle=True, **kwargs) val_loader = torch.utils.data.DataLoader( - datasets.MNIST(args.data_dir, train=False, + datasets.MNIST(data_dir, train=False, transform=transforms.Compose([ 
transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])), - batch_size=args.test_batch_size, shuffle=True, **kwargs) + batch_size=test_batch_size, shuffle=True, **kwargs) criterion = torch.nn.NLLLoss() - elif args.dataset == 'cifar10': + elif dataset == 'cifar10': normalize = transforms.Normalize( (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) train_loader = torch.utils.data.DataLoader( - datasets.CIFAR10(args.data_dir, train=True, transform=transforms.Compose([ + datasets.CIFAR10(data_dir, train=True, transform=transforms.Compose([ transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, 4), transforms.ToTensor(), normalize, ]), download=True), - batch_size=args.batch_size, shuffle=True, **kwargs) + batch_size=batch_size, shuffle=True, **kwargs) val_loader = torch.utils.data.DataLoader( - datasets.CIFAR10(args.data_dir, train=False, transform=transforms.Compose([ + datasets.CIFAR10(data_dir, train=False, transform=transforms.Compose([ transforms.ToTensor(), normalize, ])), - batch_size=args.batch_size, shuffle=False, **kwargs) + batch_size=batch_size, shuffle=False, **kwargs) criterion = torch.nn.CrossEntropyLoss() - elif args.dataset == 'imagenet': + elif dataset == 'imagenet': normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_loader = torch.utils.data.DataLoader( - datasets.ImageFolder(os.path.join(args.data_dir, 'train'), + datasets.ImageFolder(os.path.join(data_dir, 'train'), transform=transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ])), - batch_size=args.batch_size, shuffle=True, **kwargs) + batch_size=batch_size, shuffle=True, **kwargs) val_loader = torch.utils.data.DataLoader( - datasets.ImageFolder(os.path.join(args.data_dir, 'val'), + datasets.ImageFolder(os.path.join(data_dir, 'val'), transform=transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, ])), - 
batch_size=args.test_batch_size, shuffle=True, **kwargs) + batch_size=test_batch_size, shuffle=True, **kwargs) criterion = torch.nn.CrossEntropyLoss() return train_loader, val_loader, criterion @@ -127,65 +130,91 @@ def test(model, device, criterion, val_loader): return accuracy -def get_trained_model(args, device, train_loader, val_loader, criterion): +def get_trained_model_optimizer(args, device, train_loader, val_loader, criterion): if args.model == 'LeNet': model = LeNet().to(device) - optimizer = torch.optim.Adadelta(model.parameters(), lr=1) - scheduler = StepLR(optimizer, step_size=1, gamma=0.7) - for epoch in range(args.pretrain_epochs): - train(args, model, device, train_loader, - criterion, optimizer, epoch) - scheduler.step() + if args.load_pretrained_model: + model.load_state_dict(torch.load(args.pretrained_model_dir)) + optimizer = torch.optim.Adadelta(model.parameters(), lr=1e-4) + else: + optimizer = torch.optim.Adadelta(model.parameters(), lr=1) + scheduler = StepLR(optimizer, step_size=1, gamma=0.7) elif args.model == 'vgg16': model = VGG(depth=16).to(device) - optimizer = torch.optim.SGD(model.parameters(), lr=0.01, - momentum=0.9, - weight_decay=5e-4) - scheduler = MultiStepLR( - optimizer, milestones=[int(args.pretrain_epochs*0.5), int(args.pretrain_epochs*0.75)], gamma=0.1) - for epoch in range(args.pretrain_epochs): - train(args, model, device, train_loader, - criterion, optimizer, epoch) - scheduler.step() + if args.load_pretrained_model: + model.load_state_dict(torch.load(args.pretrained_model_dir)) + optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9, weight_decay=5e-4) + else: + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4) + scheduler = MultiStepLR( + optimizer, milestones=[int(args.pretrain_epochs*0.5), int(args.pretrain_epochs*0.75)], gamma=0.1) elif args.model == 'resnet18': - model = models.resnet18(pretrained=False, num_classes=10).to(device) - optimizer = 
torch.optim.SGD(model.parameters(), lr=0.01, - momentum=0.9, - weight_decay=5e-4) - scheduler = MultiStepLR( - optimizer, milestones=[int(args.pretrain_epochs*0.5), int(args.pretrain_epochs*0.75)], gamma=0.1) + model = ResNet18().to(device) + if args.load_pretrained_model: + model.load_state_dict(torch.load(args.pretrained_model_dir)) + optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9, weight_decay=5e-4) + else: + optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) + scheduler = MultiStepLR( + optimizer, milestones=[int(args.pretrain_epochs*0.5), int(args.pretrain_epochs*0.75)], gamma=0.1) + elif args.model == 'resnet50': + model = ResNet50().to(device) + if args.load_pretrained_model: + model.load_state_dict(torch.load(args.pretrained_model_dir)) + optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9, weight_decay=5e-4) + else: + optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) + scheduler = MultiStepLR( + optimizer, milestones=[int(args.pretrain_epochs*0.5), int(args.pretrain_epochs*0.75)], gamma=0.1) + else: + raise ValueError("model not recognized") + + if not args.load_pretrained_model: + best_acc = 0 + best_epoch = 0 for epoch in range(args.pretrain_epochs): - train(args, model, device, train_loader, - criterion, optimizer, epoch) + train(args, model, device, train_loader, criterion, optimizer, epoch) scheduler.step() - elif args.model == 'mobilenet_v2': - model = models.mobilenet_v2(pretrained=True).to(device) - - if args.save_model: - torch.save(model.state_dict(), os.path.join( - args.experiment_data_dir, 'model_trained.pth')) - print('Model trained saved to %s', args.experiment_data_dir) + acc = test(model, device, criterion, val_loader) + if acc > best_acc: + best_acc = acc + best_epoch = epoch + state_dict = model.state_dict() + model.load_state_dict(state_dict) + print('Best acc:', best_acc) + print('Best epoch:', best_epoch) + + if 
args.save_model: + torch.save(state_dict, os.path.join(args.experiment_data_dir, 'model_trained.pth')) + print('Model trained saved to %s', args.experiment_data_dir) return model, optimizer def get_dummy_input(args, device): if args.dataset == 'mnist': - dummy_input = torch.randn( - [args.test_batch_size, 1, 28, 28]).to(device) + dummy_input = torch.randn([args.test_batch_size, 1, 28, 28]).to(device) elif args.dataset in ['cifar10', 'imagenet']: - dummy_input = torch.randn( - [args.test_batch_size, 3, 32, 32]).to(device) - + dummy_input = torch.randn([args.test_batch_size, 3, 32, 32]).to(device) return dummy_input +def get_input_size(dataset): + if dataset == 'mnist': + input_size = (1, 1, 28, 28) + elif dataset == 'cifar10': + input_size = (1, 3, 32, 32) + elif dataset == 'imagenet': + input_size = (1, 3, 256, 256) + return input_size + + def main(args): # prepare dataset torch.manual_seed(0) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - train_loader, val_loader, criterion = get_data(args) - model, optimizer = get_trained_model(args, device, train_loader, val_loader, criterion) + train_loader, val_loader, criterion = get_data(args.dataset, args.data_dir, args.batch_size, args.test_batch_size) + model, optimizer = get_trained_model_optimizer(args, device, train_loader, val_loader, criterion) def short_term_fine_tuner(model, epochs=1): for epoch in range(epochs): @@ -198,11 +227,15 @@ def evaluator(model): return test(model, device, criterion, val_loader) # used to save the performance of the original & pruned & finetuned models - result = {} + result = {'flops': {}, 'params': {}, 'performance':{}} + + flops, params = count_flops_params(model, get_input_size(args.dataset)) + result['flops']['original'] = flops + result['params']['original'] = params evaluation_result = evaluator(model) print('Evaluation result (original model): %s' % evaluation_result) - result['original'] = evaluation_result + result['performance']['original'] = 
evaluation_result # module types to prune, only "Conv2d" supported for channel pruning if args.base_algo in ['l1', 'l2']: @@ -218,6 +251,10 @@ def evaluator(model): if args.pruner == 'L1FilterPruner': pruner = L1FilterPruner(model, config_list) + elif args.pruner == 'L2FilterPruner': + pruner = L2FilterPruner(model, config_list) + elif args.pruner == 'FPGMPruner': + pruner = FPGMPruner(model, config_list) elif args.pruner == 'NetAdaptPruner': pruner = NetAdaptPruner(model, config_list, short_term_fine_tuner=short_term_fine_tuner, evaluator=evaluator, base_algo=args.base_algo, experiment_data_dir=args.experiment_data_dir) @@ -263,99 +300,123 @@ def evaluator(model): experiment_data_dir=args.experiment_data_dir) else: raise ValueError( - "Please use L1FilterPruner, NetAdaptPruner, SimulatedAnnealingPruner, ADMMPruner or AutoCompressPruner in this example.") + "Pruner not supported.") # Pruner.compress() returns the masked model # but for AutoCompressPruner, Pruner.compress() returns directly the pruned model - model_masked = pruner.compress() - evaluation_result = evaluator(model_masked) + model = pruner.compress() + evaluation_result = evaluator(model) print('Evaluation result (masked model): %s' % evaluation_result) - result['pruned'] = evaluation_result + result['performance']['pruned'] = evaluation_result if args.save_model: pruner.export_model( os.path.join(args.experiment_data_dir, 'model_masked.pth'), os.path.join(args.experiment_data_dir, 'mask.pth')) print('Masked model saved to %s', args.experiment_data_dir) + # model speed up + if args.speed_up: + if args.pruner != 'AutoCompressPruner': + if args.model == 'LeNet': + model = LeNet().to(device) + elif args.model == 'vgg16': + model = VGG(depth=16).to(device) + elif args.model == 'resnet18': + model = ResNet18().to(device) + elif args.model == 'resnet50': + model = ResNet50().to(device) + + model.load_state_dict(torch.load(os.path.join(args.experiment_data_dir, 'model_masked.pth'))) + masks_file = 
os.path.join(args.experiment_data_dir, 'mask.pth') + + m_speedup = ModelSpeedup(model, dummy_input, masks_file, device) + m_speedup.speedup_model() + evaluation_result = evaluator(model) + print('Evaluation result (speed up model): %s' % evaluation_result) + result['performance']['speedup'] = evaluation_result + + torch.save(model.state_dict(), os.path.join(args.experiment_data_dir, 'model_speed_up.pth')) + print('Speed up model saved to %s', args.experiment_data_dir) + flops, params = count_flops_params(model, get_input_size(args.dataset)) + result['flops']['speedup'] = flops + result['params']['speedup'] = params + if args.fine_tune: if args.dataset == 'mnist': - optimizer = torch.optim.Adadelta(model_masked.parameters(), lr=1) - scheduler = StepLR(optimizer, step_size=1, gamma=0.7) - for epoch in range(args.fine_tune_epochs): - train(args, model_masked, device, train_loader, criterion, optimizer, epoch) - scheduler.step() - test(model_masked, device, criterion, val_loader) - elif args.dataset == 'cifar10': - optimizer = torch.optim.SGD(model_masked.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4) + optimizer = torch.optim.Adadelta(model.parameters(), lr=1) scheduler = StepLR(optimizer, step_size=1, gamma=0.7) - for epoch in range(args.fine_tune_epochs): - train(args, model_masked, device, train_loader, criterion, optimizer, epoch) - scheduler.step() - test(model_masked, device, criterion, val_loader) - elif args.dataset == 'imagenet': - for epoch in range(args.fine_tune_epochs): - optimizer = torch.optim.SGD(model_masked.parameters(), lr=0.05, momentum=0.9, weight_decay=5e-4) - train(args, model_masked, device, train_loader, criterion, optimizer, epoch) - test(model_masked, device, criterion, val_loader) - - evaluation_result = evaluator(model_masked) - print('Evaluation result (fine tuned): %s' % evaluation_result) - result['finetuned'] = evaluation_result + elif args.dataset == 'cifar10' and args.model == 'vgg16': + optimizer = 
torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4) + scheduler = MultiStepLR( + optimizer, milestones=[int(args.fine_tune_epochs*0.5), int(args.fine_tune_epochs*0.75)], gamma=0.1) + elif args.dataset == 'cifar10' and args.model == 'resnet18': + optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) + scheduler = MultiStepLR( + optimizer, milestones=[int(args.fine_tune_epochs*0.5), int(args.fine_tune_epochs*0.75)], gamma=0.1) + elif args.dataset == 'cifar10' and args.model == 'resnet50': + optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) + scheduler = MultiStepLR( + optimizer, milestones=[int(args.fine_tune_epochs*0.5), int(args.fine_tune_epochs*0.75)], gamma=0.1) + best_acc = 0 + for epoch in range(args.fine_tune_epochs): + train(args, model, device, train_loader, criterion, optimizer, epoch) + scheduler.step() + acc = evaluator(model) + if acc > best_acc: + best_acc = acc + torch.save(model.state_dict(), os.path.join(args.experiment_data_dir, 'model_fine_tuned.pth')) - if args.save_model: - pruner.export_model(os.path.join( - args.experiment_data_dir, 'model_fine_tuned.pth'), os.path.join(args.experiment_data_dir, 'mask.pth')) - print('Fined tuned model saved to %s', args.experiment_data_dir) + print('Evaluation result (fine tuned): %s' % best_acc) + print('Fined tuned model saved to %s', args.experiment_data_dir) + result['performance']['finetuned'] = best_acc - # model speed up - if args.speed_up and args.pruner != 'AutoCompressPruner': - if args.model == 'LeNet': - model = LeNet().to(device) - elif args.model == 'vgg16': - model = VGG(depth=16).to(device) - elif args.model == 'resnet18': - model = models.resnet18(pretrained=False, num_classes=10).to(device) - elif args.model == 'mobilenet_v2': - model = models.mobilenet_v2(pretrained=False).to(device) - - model.load_state_dict(torch.load(os.path.join(args.experiment_data_dir, 'model_fine_tuned.pth'))) - 
masks_file = os.path.join(args.experiment_data_dir, 'mask.pth') - - m_speedup = ModelSpeedup(model, dummy_input, masks_file, device) - m_speedup.speedup_model() - evaluation_result = evaluator(model) - print('Evaluation result (speed up model): %s' % evaluation_result) - result['speedup'] = evaluation_result - - torch.save(model.state_dict(), os.path.join(args.experiment_data_dir, 'model_speed_up.pth')) - print('Speed up model saved to %s', args.experiment_data_dir) - - with open(os.path.join(args.experiment_data_dir, 'performance.json'), 'w+') as f: + with open(os.path.join(args.experiment_data_dir, 'result.json'), 'w+') as f: json.dump(result, f) if __name__ == '__main__': - def str2bool(v): - if isinstance(v, bool): - return v - if v.lower() in ('yes', 'true', 't', 'y', '1'): + def str2bool(s): + if isinstance(s, bool): + return s + if s.lower() in ('yes', 'true', 't', 'y', '1'): return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): + if s.lower() in ('no', 'false', 'f', 'n', '0'): return False - else: - raise argparse.ArgumentTypeError('Boolean value expected.') + raise argparse.ArgumentTypeError('Boolean value expected.') parser = argparse.ArgumentParser(description='PyTorch Example for SimulatedAnnealingPruner') + # dataset and model + parser.add_argument('--dataset', type=str, default='cifar10', + help='dataset to use, mnist, cifar10 or imagenet') + parser.add_argument('--data-dir', type=str, default='./data/', + help='dataset directory') + parser.add_argument('--model', type=str, default='vgg16', + help='model to use, LeNet, vgg16, resnet18 or resnet50') + parser.add_argument('--load-pretrained-model', type=str2bool, default=False, + help='whether to load pretrained model') + parser.add_argument('--pretrained-model-dir', type=str, default='./', + help='path to pretrained model') + parser.add_argument('--pretrain-epochs', type=int, default=100, + help='number of epochs to pretrain the model') + parser.add_argument('--batch-size', type=int, 
default=64, + help='input batch size for training (default: 64)') + parser.add_argument('--test-batch-size', type=int, default=64, + help='input batch size for testing (default: 64)') + parser.add_argument('--fine-tune', type=str2bool, default=True, + help='whether to fine-tune the pruned model') + parser.add_argument('--fine-tune-epochs', type=int, default=5, + help='epochs to fine tune') + parser.add_argument('--experiment-data-dir', type=str, default='./experiment_data', + help='For saving experiment data') + + # pruner parser.add_argument('--pruner', type=str, default='SimulatedAnnealingPruner', - help='pruner to use, L1FilterPruner, NetAdaptPruner, SimulatedAnnealingPruner, ADMMPruner or AutoCompressPruner') + help='pruner to use') parser.add_argument('--base-algo', type=str, default='l1', help='base pruning algorithm. level, l1 or l2') - parser.add_argument('--sparsity', type=float, default=0.3, - help='overall target sparsity') - parser.add_argument('--speed-up', type=str2bool, default=False, - help='Whether to speed-up the pruned model') - + parser.add_argument('--sparsity', type=float, default=0.1, + help='target overall target sparsity') # param for SimulatedAnnealingPruner parser.add_argument('--cool-down-rate', type=float, default=0.9, help='cool down rate') @@ -363,29 +424,16 @@ def str2bool(v): parser.add_argument('--sparsity-per-iteration', type=float, default=0.05, help='sparsity_per_iteration of NetAdaptPruner') - parser.add_argument('--dataset', type=str, default='mnist', - help='dataset to use, mnist, cifar10 or imagenet (default MNIST)') - parser.add_argument('--model', type=str, default='LeNet', - help='model to use, LeNet, vgg16, resnet18 or mobilenet_v2') - parser.add_argument('--fine-tune', type=str2bool, default=True, - help='whether to fine-tune the pruned model') - parser.add_argument('--fine-tune-epochs', type=int, default=10, - help='epochs to fine tune') - parser.add_argument('--data-dir', type=str, default='/datasets/', - 
help='dataset directory') - parser.add_argument('--experiment-data-dir', type=str, default='./', - help='For saving experiment data') + # speed-up + parser.add_argument('--speed-up', type=str2bool, default=False, + help='Whether to speed-up the pruned model') - parser.add_argument('--batch-size', type=int, default=64, - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=64, - help='input batch size for testing (default: 64)') - parser.add_argument('--pretrain-epochs', type=int, default=1, - help='number of epochs to pretrain the model') + # others parser.add_argument('--log-interval', type=int, default=200, help='how many batches to wait before logging training status') parser.add_argument('--save-model', type=str2bool, default=True, help='For Saving the current Model') + args = parser.parse_args() if not os.path.exists(args.experiment_data_dir): diff --git a/examples/model_compress/comparison_of_pruners/analyze.py b/examples/model_compress/comparison_of_pruners/analyze.py new file mode 100644 index 0000000000..c7cd13f72a --- /dev/null +++ b/examples/model_compress/comparison_of_pruners/analyze.py @@ -0,0 +1,107 @@ +import argparse +import json +import matplotlib.pyplot as plt + + +def plot_performance_comparison(args): + # reference data, performance of the original model and the performance declared in the AutoCompress Paper + references = { + 'original':{ + 'cifar10':{ + 'vgg16':{ + 'performance': 0.9298, + 'params':14987722.0, + 'flops':314018314.0 + }, + 'resnet18':{ + 'performance': 0.9433, + 'params':11173962.0, + 'flops':556651530.0 + }, + 'resnet50':{ + 'performance': 0.9488, + 'params':23520842.0, + 'flops':1304694794.0 + } + } + }, + 'AutoCompressPruner':{ + 'cifar10':{ + 'vgg16':{ + 'performance': 0.9321, + 'params':52.2, # times + 'flops':8.8 + }, + 'resnet18':{ + 'performance': 0.9381, + 'params':54.2, # times + 'flops':12.2 + } + } + } + } + + markers = ['v', '^', '<', '1', '2', '3', '4', 
'8', '*', '+', 'o'] + + with open('cifar10/comparison_result_{}.json'.format(args.model), 'r') as jsonfile: + result = json.load(jsonfile) + + pruners = result.keys() + + performances = {} + flops = {} + params = {} + sparsities = {} + for pruner in pruners: + performances[pruner] = [val['performance'] for val in result[pruner]] + flops[pruner] = [val['flops'] for val in result[pruner]] + params[pruner] = [val['params'] for val in result[pruner]] + sparsities[pruner] = [val['sparsity'] for val in result[pruner]] + + fig, axs = plt.subplots(2, 1, figsize=(8, 10)) + fig.suptitle('Channel Pruning Comparison on {}/CIFAR10'.format(args.model)) + fig.subplots_adjust(hspace=0.5) + + for idx, pruner in enumerate(pruners): + axs[0].scatter(params[pruner], performances[pruner], marker=markers[idx], label=pruner) + axs[1].scatter(flops[pruner], performances[pruner], marker=markers[idx], label=pruner) + + # references + params_original = references['original']['cifar10'][args.model]['params'] + performance_original = references['original']['cifar10'][args.model]['performance'] + axs[0].plot(params_original, performance_original, 'rx', label='original model') + if args.model in ['vgg16', 'resnet18']: + axs[0].plot(params_original/references['AutoCompressPruner']['cifar10'][args.model]['params'], + references['AutoCompressPruner']['cifar10'][args.model]['performance'], + 'bx', label='AutoCompress Paper') + + axs[0].set_title("Performance v.s. 
Number of Parameters") + axs[0].set_xlabel("Number of Parameters") + axs[0].set_ylabel('Accuracy') + axs[0].legend() + + # references + flops_original = references['original']['cifar10'][args.model]['flops'] + performance_original = references['original']['cifar10'][args.model]['performance'] + axs[1].plot(flops_original, performance_original, 'rx', label='original model') + if args.model in ['vgg16', 'resnet18']: + axs[1].plot(flops_original/references['AutoCompressPruner']['cifar10'][args.model]['flops'], + references['AutoCompressPruner']['cifar10'][args.model]['performance'], + 'bx', label='AutoCompress Paper') + + axs[1].set_title("Performance v.s. FLOPs") + axs[1].set_xlabel("FLOPs") + axs[1].set_ylabel('Accuracy') + axs[1].legend() + + plt.savefig('img/performance_comparison_{}.png'.format(args.model)) + plt.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='PyTorch MNIST Example') + parser.add_argument('--model', type=str, default='vgg16', + help='vgg16, resnet18 or resnet50') + args = parser.parse_args() + + plot_performance_comparison(args) diff --git a/examples/model_compress/comparison_of_pruners/cifar10/comparison_result_resnet18.json b/examples/model_compress/comparison_of_pruners/cifar10/comparison_result_resnet18.json new file mode 100644 index 0000000000..0ef5a6119d --- /dev/null +++ b/examples/model_compress/comparison_of_pruners/cifar10/comparison_result_resnet18.json @@ -0,0 +1,392 @@ +{ + "L1FilterPruner": [ + { + "sparsity": 0.1, + "params": 9642085.0, + "flops": 496882684.0, + "performance": 0.9436 + }, + { + "sparsity": 0.2, + "params": 8149126.0, + "flops": 436381222.0, + "performance": 0.9472 + }, + { + "sparsity": 0.3, + "params": 6705269.0, + "flops": 371666312.0, + "performance": 0.9391 + }, + { + "sparsity": 0.4, + "params": 5335138.0, + "flops": 307050934.0, + "performance": 0.9433 + }, + { + "sparsity": 0.5, + "params": 3998122.0, + "flops": 237900244.0, + "performance": 0.9379 + }, + { + 
"sparsity": 0.6, + "params": 2767325.0, + "flops": 175308326.0, + "performance": 0.9326 + }, + { + "sparsity": 0.7, + "params": 1617817.0, + "flops": 108532198.0, + "performance": 0.928 + }, + { + "sparsity": 0.8, + "params": 801338.0, + "flops": 53808728.0, + "performance": 0.9145 + }, + { + "sparsity": 0.9, + "params": 229372.0, + "flops": 15304972.0, + "performance": 0.8858 + }, + { + "sparsity": 0.95, + "params": 61337.0, + "flops": 4305146.0, + "performance": 0.8441 + }, + { + "sparsity": 0.975, + "params": 17763.0, + "flops": 1561644.0, + "performance": 0.7294 + } + ], + "L2FilterPruner": [ + { + "sparsity": 0.1, + "params": 9680242.0, + "flops": 497492746.0, + "performance": 0.9423 + }, + { + "sparsity": 0.2, + "params": 8137784.0, + "flops": 436199900.0, + "performance": 0.9471 + }, + { + "sparsity": 0.3, + "params": 6702679.0, + "flops": 369733768.0, + "performance": 0.9415 + }, + { + "sparsity": 0.4, + "params": 5330426.0, + "flops": 305512736.0, + "performance": 0.9411 + }, + { + "sparsity": 0.5, + "params": 3961076.0, + "flops": 236467814.0, + "performance": 0.9349 + }, + { + "sparsity": 0.6, + "params": 2776512.0, + "flops": 175872204.0, + "performance": 0.9393 + }, + { + "sparsity": 0.7, + "params": 1622571.0, + "flops": 107994906.0, + "performance": 0.9295 + }, + { + "sparsity": 0.8, + "params": 797075.0, + "flops": 53534414.0, + "performance": 0.9187 + }, + { + "sparsity": 0.9, + "params": 232153.0, + "flops": 15385078.0, + "performance": 0.8838 + }, + { + "sparsity": 0.95, + "params": 58180.0, + "flops": 4510072.0, + "performance": 0.8396 + }, + { + "sparsity": 0.975, + "params": 16836.0, + "flops": 1429752.0, + "performance": 0.7482 + } + ], + "FPGMPruner": [ + { + "sparsity": 0.1, + "params": 9705680.0, + "flops": 497899454.0, + "performance": 0.9443 + }, + { + "sparsity": 0.2, + "params": 8160468.0, + "flops": 436562544.0, + "performance": 0.946 + }, + { + "sparsity": 0.3, + "params": 6710052.0, + "flops": 367960482.0, + "performance": 0.9452 + 
}, + { + "sparsity": 0.4, + "params": 5334205.0, + "flops": 306166432.0, + "performance": 0.9412 + }, + { + "sparsity": 0.5, + "params": 4007259.0, + "flops": 237702210.0, + "performance": 0.9385 + }, + { + "sparsity": 0.6, + "params": 2782236.0, + "flops": 175813620.0, + "performance": 0.9304 + }, + { + "sparsity": 0.7, + "params": 1634603.0, + "flops": 108904676.0, + "performance": 0.9249 + }, + { + "sparsity": 0.8, + "params": 799610.0, + "flops": 53645918.0, + "performance": 0.9203 + }, + { + "sparsity": 0.9, + "params": 233644.0, + "flops": 15408784.0, + "performance": 0.8856 + }, + { + "sparsity": 0.95, + "params": 56518.0, + "flops": 4266910.0, + "performance": 0.83 + }, + { + "sparsity": 0.975, + "params": 17610.0, + "flops": 1441836.0, + "performance": 0.7356 + } + ], + "NetAdaptPruner": [ + { + "sparsity": 0.1, + "params": 11173962.0, + "flops": 556651530.0, + "performance": 0.9474 + }, + { + "sparsity": 0.2, + "params": 10454958.0, + "flops": 545147466.0, + "performance": 0.9482 + }, + { + "sparsity": 0.3, + "params": 9299986.0, + "flops": 526681564.0, + "performance": 0.9469 + }, + { + "sparsity": 0.4, + "params": 8137618.0, + "flops": 508087276.0, + "performance": 0.9451 + }, + { + "sparsity": 0.5, + "params": 6267654.0, + "flops": 478185102.0, + "performance": 0.947 + }, + { + "sparsity": 0.6, + "params": 5277444.0, + "flops": 462341742.0, + "performance": 0.9469 + }, + { + "sparsity": 0.7, + "params": 4854190.0, + "flops": 455580628.0, + "performance": 0.9466 + }, + { + "sparsity": 0.8, + "params": 3531098.0, + "flops": 434411156.0, + "performance": 0.9472 + } + ], + "SimulatedAnnealingPruner": [ + { + "sparsity": 0.1, + "params": 10307424.0, + "flops": 537697098.0, + "performance": 0.942 + }, + { + "sparsity": 0.2, + "params": 9264598.0, + "flops": 513101368.0, + "performance": 0.9456 + }, + { + "sparsity": 0.3, + "params": 7999316.0, + "flops": 489260738.0, + "performance": 0.946 + }, + { + "sparsity": 0.4, + "params": 6996176.0, + "flops": 
450768626.0, + "performance": 0.9413 + }, + { + "sparsity": 0.5, + "params": 5412616.0, + "flops": 408698434.0, + "performance": 0.9477 + }, + { + "sparsity": 0.6, + "params": 5106924.0, + "flops": 391735326.0, + "performance": 0.9483 + }, + { + "sparsity": 0.7, + "params": 3032105.0, + "flops": 269777978.0, + "performance": 0.9414 + }, + { + "sparsity": 0.8, + "params": 2423230.0, + "flops": 294783862.0, + "performance": 0.9384 + }, + { + "sparsity": 0.9, + "params": 1151046.0, + "flops": 209639226.0, + "performance": 0.939 + }, + { + "sparsity": 0.95, + "params": 394406.0, + "flops": 108776618.0, + "performance": 0.923 + }, + { + "sparsity": 0.975, + "params": 250649.0, + "flops": 84645050.0, + "performance": 0.917 + } + ], + "AutoCompressPruner": [ + { + "sparsity": 0.1, + "params": 10238286.0, + "flops": 536590794.0, + "performance": 0.9406 + }, + { + "sparsity": 0.2, + "params": 9272049.0, + "flops": 512333916.0, + "performance": 0.9392 + }, + { + "sparsity": 0.3, + "params": 8099915.0, + "flops": 485418056.0, + "performance": 0.9398 + }, + { + "sparsity": 0.4, + "params": 6864547.0, + "flops": 449359492.0, + "performance": 0.9406 + }, + { + "sparsity": 0.5, + "params": 6106994.0, + "flops": 430766432.0, + "performance": 0.9397 + }, + { + "sparsity": 0.6, + "params": 5338096.0, + "flops": 415085278.0, + "performance": 0.9384 + }, + { + "sparsity": 0.7, + "params": 3701330.0, + "flops": 351057878.0, + "performance": 0.938 + }, + { + "sparsity": 0.8, + "params": 2229760.0, + "flops": 269058346.0, + "performance": 0.9388 + }, + { + "sparsity": 0.9, + "params": 1108564.0, + "flops": 189355930.0, + "performance": 0.9348 + }, + { + "sparsity": 0.95, + "params": 616893.0, + "flops": 159314256.0, + "performance": 0.93 + }, + { + "sparsity": 0.975, + "params": 297368.0, + "flops": 113398292.0, + "performance": 0.9072 + } + ] +} \ No newline at end of file diff --git a/examples/model_compress/comparison_of_pruners/cifar10/comparison_result_resnet50.json 
b/examples/model_compress/comparison_of_pruners/cifar10/comparison_result_resnet50.json new file mode 100644 index 0000000000..dcea274149 --- /dev/null +++ b/examples/model_compress/comparison_of_pruners/cifar10/comparison_result_resnet50.json @@ -0,0 +1,356 @@ +{ + "L1FilterPruner": [ + { + "sparsity": 0.1, + "params": 20378141.0, + "flops": 1134740738.0, + "performance": 0.9456 + }, + { + "sparsity": 0.2, + "params": 17286560.0, + "flops": 966734852.0, + "performance": 0.9433 + }, + { + "sparsity": 0.3, + "params": 14403947.0, + "flops": 807114812.0, + "performance": 0.9396 + }, + { + "sparsity": 0.4, + "params": 11558288.0, + "flops": 656314106.0, + "performance": 0.9402 + }, + { + "sparsity": 0.5, + "params": 8826728.0, + "flops": 507965924.0, + "performance": 0.9394 + }, + { + "sparsity": 0.6, + "params": 6319902.0, + "flops": 374211960.0, + "performance": 0.9372 + }, + { + "sparsity": 0.7, + "params": 4063713.0, + "flops": 246788556.0, + "performance": 0.9304 + }, + { + "sparsity": 0.8, + "params": 2120717.0, + "flops": 133614422.0, + "performance": 0.9269 + }, + { + "sparsity": 0.9, + "params": 652524.0, + "flops": 41973714.0, + "performance": 0.9081 + }, + { + "sparsity": 0.95, + "params": 195468.0, + "flops": 13732020.0, + "performance": 0.8723 + }, + { + "sparsity": 0.975, + "params": 58054.0, + "flops": 4268104.0, + "performance": 0.7941 + } + ], + "L2FilterPruner": [ + { + "sparsity": 0.1, + "params": 20378141.0, + "flops": 1134740738.0, + "performance": 0.9442 + }, + { + "sparsity": 0.2, + "params": 17275244.0, + "flops": 966400928.0, + "performance": 0.9463 + }, + { + "sparsity": 0.3, + "params": 14415409.0, + "flops": 807710914.0, + "performance": 0.9367 + }, + { + "sparsity": 0.4, + "params": 11564310.0, + "flops": 656653008.0, + "performance": 0.9391 + }, + { + "sparsity": 0.5, + "params": 8843266.0, + "flops": 508086256.0, + "performance": 0.9381 + }, + { + "sparsity": 0.6, + "params": 6316815.0, + "flops": 373882614.0, + "performance": 0.9368 + 
}, + { + "sparsity": 0.7, + "params": 4054272.0, + "flops": 246477678.0, + "performance": 0.935 + }, + { + "sparsity": 0.8, + "params": 2129321.0, + "flops": 134527520.0, + "performance": 0.9275 + }, + { + "sparsity": 0.9, + "params": 667500.0, + "flops": 42927060.0, + "performance": 0.9129 + }, + { + "sparsity": 0.95, + "params": 192464.0, + "flops": 13669430.0, + "performance": 0.8757 + }, + { + "sparsity": 0.975, + "params": 58250.0, + "flops": 4365620.0, + "performance": 0.7978 + } + ], + "FPGMPruner": [ + { + "sparsity": 0.1, + "params": 20401570.0, + "flops": 1135114552.0, + "performance": 0.9438 + }, + { + "sparsity": 0.2, + "params": 17321414.0, + "flops": 967137398.0, + "performance": 0.9427 + }, + { + "sparsity": 0.3, + "params": 14418221.0, + "flops": 807755756.0, + "performance": 0.9422 + }, + { + "sparsity": 0.4, + "params": 11565000.0, + "flops": 655412124.0, + "performance": 0.9403 + }, + { + "sparsity": 0.5, + "params": 8829840.0, + "flops": 506715294.0, + "performance": 0.9355 + }, + { + "sparsity": 0.6, + "params": 6308085.0, + "flops": 374231682.0, + "performance": 0.9359 + }, + { + "sparsity": 0.7, + "params": 4054237.0, + "flops": 246511714.0, + "performance": 0.9285 + }, + { + "sparsity": 0.8, + "params": 2134187.0, + "flops": 134456366.0, + "performance": 0.9275 + }, + { + "sparsity": 0.9, + "params": 665931.0, + "flops": 42859752.0, + "performance": 0.9083 + }, + { + "sparsity": 0.95, + "params": 191590.0, + "flops": 13641052.0, + "performance": 0.8762 + }, + { + "sparsity": 0.975, + "params": 57767.0, + "flops": 4350074.0, + "performance": 0.789 + } + ], + "NetAdaptPruner": [ + { + "sparsity": 0.1, + "params": 22348970.0, + "flops": 1275701258.0, + "performance": 0.9404 + }, + { + "sparsity": 0.2, + "params": 21177162.0, + "flops": 1256952330.0, + "performance": 0.9445 + }, + { + "sparsity": 0.3, + "params": 18407434.0, + "flops": 1212636682.0, + "performance": 0.9433 + }, + { + "sparsity": 0.4, + "params": 16061284.0, + "flops": 
1175098282.0, + "performance": 0.9401 + } + ], + "SimulatedAnnealingPruner": [ + { + "sparsity": 0.1, + "params": 20551755.0, + "flops": 1230145122.0, + "performance": 0.9438 + }, + { + "sparsity": 0.2, + "params": 17766048.0, + "flops": 1159924128.0, + "performance": 0.9432 + }, + { + "sparsity": 0.3, + "params": 15105146.0, + "flops": 1094478662.0, + "performance": 0.943 + }, + { + "sparsity": 0.4, + "params": 12378092.0, + "flops": 1008801158.0, + "performance": 0.9398 + }, + { + "sparsity": 0.5, + "params": 9890487.0, + "flops": 911941770.0, + "performance": 0.9426 + }, + { + "sparsity": 0.6, + "params": 7638262.0, + "flops": 831218770.0, + "performance": 0.9412 + }, + { + "sparsity": 0.7, + "params": 5469936.0, + "flops": 691881792.0, + "performance": 0.9405 + }, + { + "sparsity": 0.8, + "params": 3668951.0, + "flops": 580850666.0, + "performance": 0.941 + }, + { + "sparsity": 0.9, + "params": 1765284.0, + "flops": 389162310.0, + "performance": 0.9294 + } + ], + "AutoCompressPruner": [ + { + "sparsity": 0.1, + "params": 20660299.0, + "flops": 1228508590.0, + "performance": 0.9337 + }, + { + "sparsity": 0.2, + "params": 17940465.0, + "flops": 1152868146.0, + "performance": 0.9326 + }, + { + "sparsity": 0.3, + "params": 15335831.0, + "flops": 1084996094.0, + "performance": 0.9348 + }, + { + "sparsity": 0.4, + "params": 12821408.0, + "flops": 991305524.0, + "performance": 0.936 + }, + { + "sparsity": 0.5, + "params": 10695425.0, + "flops": 919638860.0, + "performance": 0.9349 + }, + { + "sparsity": 0.6, + "params": 8536821.0, + "flops": 802011678.0, + "performance": 0.9339 + }, + { + "sparsity": 0.7, + "params": 7276898.0, + "flops": 744248114.0, + "performance": 0.9337 + }, + { + "sparsity": 0.8, + "params": 5557721.0, + "flops": 643881710.0, + "performance": 0.9323 + }, + { + "sparsity": 0.9, + "params": 3925140.0, + "flops": 512545272.0, + "performance": 0.9304 + }, + { + "sparsity": 0.95, + "params": 2867004.0, + "flops": 365184762.0, + "performance": 0.9263 
+ }, + { + "sparsity": 0.975, + "params": 1773257.0, + "flops": 229320266.0, + "performance": 0.9175 + } + ] +} \ No newline at end of file diff --git a/examples/model_compress/comparison_of_pruners/cifar10/comparison_result_vgg16.json b/examples/model_compress/comparison_of_pruners/cifar10/comparison_result_vgg16.json new file mode 100644 index 0000000000..9e476488c1 --- /dev/null +++ b/examples/model_compress/comparison_of_pruners/cifar10/comparison_result_vgg16.json @@ -0,0 +1,392 @@ +{ + "L1FilterPruner": [ + { + "sparsity": 0.1, + "params": 12187336.0, + "flops": 256252606.0, + "performance": 0.9344 + }, + { + "sparsity": 0.2, + "params": 9660216.0, + "flops": 203049930.0, + "performance": 0.9371 + }, + { + "sparsity": 0.3, + "params": 7435417.0, + "flops": 155477470.0, + "performance": 0.9341 + }, + { + "sparsity": 0.4, + "params": 5493954.0, + "flops": 114721578.0, + "performance": 0.9317 + }, + { + "sparsity": 0.5, + "params": 3820010.0, + "flops": 79155722.0, + "performance": 0.9309 + }, + { + "sparsity": 0.6, + "params": 2478632.0, + "flops": 51618494.0, + "performance": 0.9229 + }, + { + "sparsity": 0.7, + "params": 1420600.0, + "flops": 29455306.0, + "performance": 0.9031 + }, + { + "sparsity": 0.8, + "params": 658553.0, + "flops": 13290974.0, + "performance": 0.8756 + }, + { + "sparsity": 0.9, + "params": 186178.0, + "flops": 3574570.0, + "performance": 0.8145 + }, + { + "sparsity": 0.95, + "params": 58680.0, + "flops": 1050570.0, + "performance": 0.6983 + }, + { + "sparsity": 0.975, + "params": 23408.0, + "flops": 329918.0, + "performance": 0.5573 + } + ], + "L2FilterPruner": [ + { + "sparsity": 0.1, + "params": 12187336.0, + "flops": 256252606.0, + "performance": 0.9357 + }, + { + "sparsity": 0.2, + "params": 9660216.0, + "flops": 203049930.0, + "performance": 0.9355 + }, + { + "sparsity": 0.3, + "params": 7435417.0, + "flops": 155477470.0, + "performance": 0.9337 + }, + { + "sparsity": 0.4, + "params": 5493954.0, + "flops": 114721578.0, + 
"performance": 0.9308 + }, + { + "sparsity": 0.5, + "params": 3820010.0, + "flops": 79155722.0, + "performance": 0.9285 + }, + { + "sparsity": 0.6, + "params": 2478632.0, + "flops": 51618494.0, + "performance": 0.9208 + }, + { + "sparsity": 0.7, + "params": 1420600.0, + "flops": 29455306.0, + "performance": 0.909 + }, + { + "sparsity": 0.8, + "params": 658553.0, + "flops": 13290974.0, + "performance": 0.8698 + }, + { + "sparsity": 0.9, + "params": 186178.0, + "flops": 3574570.0, + "performance": 0.8203 + }, + { + "sparsity": 0.95, + "params": 58680.0, + "flops": 1050570.0, + "performance": 0.7063 + }, + { + "sparsity": 0.975, + "params": 23408.0, + "flops": 329918.0, + "performance": 0.5455 + } + ], + "FPGMPruner": [ + { + "sparsity": 0.1, + "params": 12187336.0, + "flops": 256252606.0, + "performance": 0.937 + }, + { + "sparsity": 0.2, + "params": 9660216.0, + "flops": 203049930.0, + "performance": 0.936 + }, + { + "sparsity": 0.3, + "params": 7435417.0, + "flops": 155477470.0, + "performance": 0.9359 + }, + { + "sparsity": 0.4, + "params": 5493954.0, + "flops": 114721578.0, + "performance": 0.9302 + }, + { + "sparsity": 0.5, + "params": 3820010.0, + "flops": 79155722.0, + "performance": 0.9233 + }, + { + "sparsity": 0.6, + "params": 2478632.0, + "flops": 51618494.0, + "performance": 0.922 + }, + { + "sparsity": 0.7, + "params": 1420600.0, + "flops": 29455306.0, + "performance": 0.9022 + }, + { + "sparsity": 0.8, + "params": 658553.0, + "flops": 13290974.0, + "performance": 0.8794 + }, + { + "sparsity": 0.9, + "params": 186178.0, + "flops": 3574570.0, + "performance": 0.8276 + }, + { + "sparsity": 0.95, + "params": 58680.0, + "flops": 1050570.0, + "performance": 0.6967 + }, + { + "sparsity": 0.975, + "params": 23408.0, + "flops": 329918.0, + "performance": 0.3683 + } + ], + "NetAdaptPruner": [ + { + "sparsity": 0.1, + "params": 13492098.0, + "flops": 308484330.0, + "performance": 0.9376 + }, + { + "sparsity": 0.2, + "params": 11998408.0, + "flops": 297641410.0, + 
"performance": 0.9374 + }, + { + "sparsity": 0.3, + "params": 10504344.0, + "flops": 281928834.0, + "performance": 0.9369 + }, + { + "sparsity": 0.4, + "params": 8263221.0, + "flops": 272964342.0, + "performance": 0.9382 + }, + { + "sparsity": 0.5, + "params": 6769885.0, + "flops": 249070966.0, + "performance": 0.9388 + }, + { + "sparsity": 0.6, + "params": 6022137.0, + "flops": 237106998.0, + "performance": 0.9383 + }, + { + "sparsity": 0.7, + "params": 4526754.0, + "flops": 222152490.0, + "performance": 0.936 + }, + { + "sparsity": 0.8, + "params": 3032759.0, + "flops": 162401210.0, + "performance": 0.9362 + } + ], + "SimulatedAnnealingPruner": [ + { + "sparsity": 0.1, + "params": 12691704.0, + "flops": 301467870.0, + "performance": 0.9366 + }, + { + "sparsity": 0.2, + "params": 10318461.0, + "flops": 275724450.0, + "performance": 0.9362 + }, + { + "sparsity": 0.3, + "params": 8217127.0, + "flops": 246321046.0, + "performance": 0.9371 + }, + { + "sparsity": 0.4, + "params": 6458368.0, + "flops": 232948294.0, + "performance": 0.9378 + }, + { + "sparsity": 0.5, + "params": 4973079.0, + "flops": 217675254.0, + "performance": 0.9362 + }, + { + "sparsity": 0.6, + "params": 3131526.0, + "flops": 151576878.0, + "performance": 0.9347 + }, + { + "sparsity": 0.7, + "params": 1891036.0, + "flops": 76575574.0, + "performance": 0.9289 + }, + { + "sparsity": 0.8, + "params": 1170751.0, + "flops": 107532322.0, + "performance": 0.9325 + }, + { + "sparsity": 0.9, + "params": 365978.0, + "flops": 46241354.0, + "performance": 0.9167 + }, + { + "sparsity": 0.95, + "params": 167089.0, + "flops": 38589922.0, + "performance": 0.7746 + }, + { + "sparsity": 0.975, + "params": 96779.0, + "flops": 26838230.0, + "performance": 0.1 + } + ], + "AutoCompressPruner": [ + { + "sparsity": 0.1, + "params": 12460277.0, + "flops": 290311730.0, + "performance": 0.9352 + }, + { + "sparsity": 0.2, + "params": 10138147.0, + "flops": 269180938.0, + "performance": 0.9324 + }, + { + "sparsity": 0.3, + 
"params": 8033350.0, + "flops": 241789714.0, + "performance": 0.9357 + }, + { + "sparsity": 0.4, + "params": 6105156.0, + "flops": 213573294.0, + "performance": 0.9367 + }, + { + "sparsity": 0.5, + "params": 4372604.0, + "flops": 185826362.0, + "performance": 0.9387 + }, + { + "sparsity": 0.6, + "params": 3029629.0, + "flops": 166285498.0, + "performance": 0.9334 + }, + { + "sparsity": 0.7, + "params": 1897060.0, + "flops": 134897806.0, + "performance": 0.9359 + }, + { + "sparsity": 0.8, + "params": 1145509.0, + "flops": 111766450.0, + "performance": 0.9334 + }, + { + "sparsity": 0.9, + "params": 362546.0, + "flops": 50777246.0, + "performance": 0.9261 + }, + { + "sparsity": 0.95, + "params": 149735.0, + "flops": 39201770.0, + "performance": 0.8924 + }, + { + "sparsity": 0.975, + "params": 45378.0, + "flops": 13213974.0, + "performance": 0.8193 + } + ] +} \ No newline at end of file diff --git a/examples/model_compress/comparison_of_pruners/img/performance_comparison_resnet18.png b/examples/model_compress/comparison_of_pruners/img/performance_comparison_resnet18.png new file mode 100644 index 0000000000..87a99e85bd Binary files /dev/null and b/examples/model_compress/comparison_of_pruners/img/performance_comparison_resnet18.png differ diff --git a/examples/model_compress/comparison_of_pruners/img/performance_comparison_resnet50.png b/examples/model_compress/comparison_of_pruners/img/performance_comparison_resnet50.png new file mode 100644 index 0000000000..7214a368b0 Binary files /dev/null and b/examples/model_compress/comparison_of_pruners/img/performance_comparison_resnet50.png differ diff --git a/examples/model_compress/comparison_of_pruners/img/performance_comparison_vgg16.png b/examples/model_compress/comparison_of_pruners/img/performance_comparison_vgg16.png new file mode 100644 index 0000000000..93930561b3 Binary files /dev/null and b/examples/model_compress/comparison_of_pruners/img/performance_comparison_vgg16.png differ diff --git 
a/examples/model_compress/model_prune_tf.py b/examples/model_compress/model_prune_tf.py new file mode 100644 index 0000000000..99e8278df4 --- /dev/null +++ b/examples/model_compress/model_prune_tf.py @@ -0,0 +1,82 @@ +import argparse + +import tensorflow as tf + +import nni.compression.tensorflow + +prune_config = { + 'level': { + 'dataset_name': 'mnist', + 'model_name': 'naive', + 'pruner_class': nni.compression.tensorflow.LevelPruner, + 'config_list': [{ + 'sparsity': 0.9, + 'op_types': ['default'], + }] + }, +} + + +def get_dataset(dataset_name='mnist'): + assert dataset_name == 'mnist' + + (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data() + x_train = x_train[..., tf.newaxis] / 255.0 + x_test = x_test[..., tf.newaxis] / 255.0 + return (x_train, y_train), (x_test, y_test) + + +def create_model(model_name='naive'): + assert model_name == 'naive' + return tf.keras.Sequential([ + tf.keras.layers.Conv2D(filters=20, kernel_size=5), + tf.keras.layers.BatchNormalization(), + tf.keras.layers.ReLU(), + tf.keras.layers.MaxPool2D(pool_size=2), + tf.keras.layers.Conv2D(filters=20, kernel_size=5), + tf.keras.layers.BatchNormalization(), + tf.keras.layers.ReLU(), + tf.keras.layers.MaxPool2D(pool_size=2), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(units=500), + tf.keras.layers.ReLU(), + tf.keras.layers.Dense(units=10), + tf.keras.layers.Softmax() + ]) + + +def create_pruner(model, pruner_name): + pruner_class = prune_config[pruner_name]['pruner_class'] + config_list = prune_config[pruner_name]['config_list'] + return pruner_class(model, config_list) + + +def main(args): + model_name = prune_config[args.pruner_name]['model_name'] + dataset_name = prune_config[args.pruner_name]['dataset_name'] + train_set, test_set = get_dataset(dataset_name) + model = create_model(model_name) + + optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9, decay=1e-4) + model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', 
metrics=['accuracy']) + + print('start training') + model.fit(train_set[0], train_set[1], batch_size=args.batch_size, epochs=args.pretrain_epochs, validation_data=test_set) + + print('start model pruning') + optimizer_finetune = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9, decay=1e-4) + pruner = create_pruner(model, args.pruner_name) + model = pruner.compress() + model.compile(optimizer=optimizer_finetune, loss='sparse_categorical_crossentropy', metrics=['accuracy']) + model.fit(train_set[0], train_set[1], batch_size=args.batch_size, epochs=args.prune_epochs, validation_data=test_set) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--pruner_name', type=str, default='level') + parser.add_argument('--batch_size', type=int, default=256) + parser.add_argument('--pretrain_epochs', type=int, default=10) + parser.add_argument('--prune_epochs', type=int, default=10) + + args = parser.parse_args() + main(args) diff --git a/examples/model_compress/models/cifar10/resnet.py b/examples/model_compress/models/cifar10/resnet.py new file mode 100644 index 0000000000..386ff8321c --- /dev/null +++ b/examples/model_compress/models/cifar10/resnet.py @@ -0,0 +1,115 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d( + in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, + stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = 
F.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, in_planes, planes, stride=1): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, + stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, self.expansion * + planes, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(self.expansion*planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = F.relu(self.bn2(self.conv2(out))) + out = self.bn3(self.conv3(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class ResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(ResNet, self).__init__() + self.in_planes = 64 + # this layer is different from torchvision.resnet18() since this model adopted for Cifar10 + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + self.linear = nn.Linear(512*block.expansion, num_classes) + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + 
self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 4) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +def ResNet18(): + return ResNet(BasicBlock, [2, 2, 2, 2]) + + +def ResNet34(): + return ResNet(BasicBlock, [3, 4, 6, 3]) + + +def ResNet50(): + return ResNet(Bottleneck, [3, 4, 6, 3]) + + +def ResNet101(): + return ResNet(Bottleneck, [3, 4, 23, 3]) + + +def ResNet152(): + return ResNet(Bottleneck, [3, 8, 36, 3]) diff --git a/examples/model_compress/models/mobilenet.py b/examples/model_compress/models/mobilenet.py new file mode 100644 index 0000000000..8d60c90a4c --- /dev/null +++ b/examples/model_compress/models/mobilenet.py @@ -0,0 +1,83 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import torch.nn as nn +import math + + +def conv_bn(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU(inplace=True) + ) + + +def conv_dw(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), + nn.BatchNorm2d(inp), + nn.ReLU(inplace=True), + + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU(inplace=True), + ) + + +class MobileNet(nn.Module): + def __init__(self, n_class, profile='normal'): + super(MobileNet, self).__init__() + + # original + if profile == 'normal': + in_planes = 32 + cfg = [64, (128, 2), 128, (256, 2), 256, (512, 2), 512, 512, 512, 512, 512, (1024, 2), 1024] + # 0.5 AMC + elif profile == '0.5flops': + in_planes = 24 + cfg = [48, (96, 2), 80, (192, 2), 200, (328, 2), 352, 368, 360, 328, 400, (736, 2), 752] + else: + raise NotImplementedError + + self.conv1 = conv_bn(3, in_planes, stride=2) + + self.features = self._make_layers(in_planes, cfg, 
conv_dw) + + self.classifier = nn.Sequential( + nn.Linear(cfg[-1], n_class), + ) + + self._initialize_weights() + + def forward(self, x): + x = self.conv1(x) + x = self.features(x) + x = x.mean(3).mean(2) # global average pooling + + x = self.classifier(x) + return x + + def _make_layers(self, in_planes, cfg, layer): + layers = [] + for x in cfg: + out_planes = x if isinstance(x, int) else x[0] + stride = 1 if isinstance(x, int) else x[1] + layers.append(layer(in_planes, out_planes, stride)) + in_planes = out_planes + return nn.Sequential(*layers) + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + n = m.weight.size(1) + m.weight.data.normal_(0, 0.01) + m.bias.data.zero_() diff --git a/examples/model_compress/models/mobilenet_v2.py b/examples/model_compress/models/mobilenet_v2.py new file mode 100644 index 0000000000..b77e85e60b --- /dev/null +++ b/examples/model_compress/models/mobilenet_v2.py @@ -0,0 +1,128 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import torch.nn as nn +import math + + +def conv_bn(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU6(inplace=True) + ) + + +def conv_1x1_bn(inp, oup): + return nn.Sequential( + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU6(inplace=True) + ) + + +class InvertedResidual(nn.Module): + def __init__(self, inp, oup, stride, expand_ratio): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2] + + hidden_dim = round(inp * expand_ratio) + self.use_res_connect = self.stride == 1 and inp == oup + + if expand_ratio == 1: + self.conv = nn.Sequential( + # dw + nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), + nn.BatchNorm2d(hidden_dim), + nn.ReLU6(inplace=True), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + ) + else: + self.conv = nn.Sequential( + # pw + nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False), + nn.BatchNorm2d(hidden_dim), + nn.ReLU6(inplace=True), + # dw + nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), + nn.BatchNorm2d(hidden_dim), + nn.ReLU6(inplace=True), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + ) + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + +class MobileNetV2(nn.Module): + def __init__(self, n_class=1000, input_size=224, width_mult=1.): + super(MobileNetV2, self).__init__() + block = InvertedResidual + input_channel = 32 + last_channel = 1280 + interverted_residual_setting = [ + # t, c, n, s + [1, 16, 1, 1], + [6, 24, 2, 2], + [6, 32, 3, 2], + [6, 64, 4, 2], + [6, 96, 3, 1], + [6, 160, 3, 2], + [6, 320, 1, 1], + ] + + # building first layer + assert input_size % 32 == 0 + input_channel = int(input_channel * width_mult) + self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else 
last_channel + self.features = [conv_bn(3, input_channel, 2)] + # building inverted residual blocks + for t, c, n, s in interverted_residual_setting: + output_channel = int(c * width_mult) + for i in range(n): + if i == 0: + self.features.append(block(input_channel, output_channel, s, expand_ratio=t)) + else: + self.features.append(block(input_channel, output_channel, 1, expand_ratio=t)) + input_channel = output_channel + # building last several layers + self.features.append(conv_1x1_bn(input_channel, self.last_channel)) + # make it nn.Sequential + self.features = nn.Sequential(*self.features) + + # building classifier + self.classifier = nn.Sequential( + nn.Dropout(0.2), + nn.Linear(self.last_channel, n_class), + ) + + self._initialize_weights() + + def forward(self, x): + x = self.features(x) + x = x.mean(3).mean(2) + x = self.classifier(x) + return x + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + n = m.weight.size(1) + m.weight.data.normal_(0, 0.01) + m.bias.data.zero_() diff --git a/setup.py b/setup.py index 30d4f448c6..fc86bbc954 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ def read(fname): 'schema', 'PythonWebHDFS', 'colorama', - 'scikit-learn>=0.20,<0.22', + 'scikit-learn>=0.23.2', 'pkginfo', 'websockets' ], diff --git a/src/nni_manager/common/manager.ts b/src/nni_manager/common/manager.ts index f37745de16..c003598abc 100644 --- a/src/nni_manager/common/manager.ts +++ b/src/nni_manager/common/manager.ts @@ -4,7 +4,7 @@ 'use strict'; import { MetricDataRecord, MetricType, TrialJobInfo } from './datastore'; -import { TrialJobStatus } from './trainingService'; +import { TrialJobStatus, LogType } from './trainingService'; type ProfileUpdateType = 'TRIAL_CONCURRENCY' | 'MAX_EXEC_DURATION' | 'SEARCH_SPACE' | 'MAX_TRIAL_NUM'; type ExperimentStatus = 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL'; @@ -101,6 +101,8 @@ abstract class Manager { public abstract getMetricDataByRange(minSeqId: number, maxSeqId: number): Promise; public abstract getLatestMetricData(): Promise; + public abstract getTrialLog(trialJobId: string, logType: LogType): Promise; + public abstract getTrialJobStatistics(): Promise; public abstract getStatus(): NNIManagerStatus; } diff --git a/src/nni_manager/common/trainingService.ts b/src/nni_manager/common/trainingService.ts index 83bd51e884..4edcf16ab6 100644 --- a/src/nni_manager/common/trainingService.ts +++ b/src/nni_manager/common/trainingService.ts @@ -8,6 +8,8 @@ */ type TrialJobStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED' | 'SYS_CANCELED' | 'EARLY_STOPPED'; +type LogType = 'TRIAL_LOG' | 'TRIAL_ERROR'; + interface TrainingServiceMetadata { readonly key: 
string; readonly value: string; @@ -79,6 +81,7 @@ abstract class TrainingService { public abstract updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise; public abstract get isMultiPhaseJobSupported(): boolean; public abstract cancelTrialJob(trialJobId: string, isEarlyStopped?: boolean): Promise; + public abstract getTrialLog(trialJobId: string, logType: LogType): Promise; public abstract setClusterMetadata(key: string, value: string): Promise; public abstract getClusterMetadata(key: string): Promise; public abstract cleanUp(): Promise; @@ -98,5 +101,5 @@ class NNIManagerIpConfig { export { TrainingService, TrainingServiceError, TrialJobStatus, TrialJobApplicationForm, TrainingServiceMetadata, TrialJobDetail, TrialJobMetric, HyperParameters, - NNIManagerIpConfig + NNIManagerIpConfig, LogType }; diff --git a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts index 038fe9ef9a..ad243f4835 100644 --- a/src/nni_manager/core/nnimanager.ts +++ b/src/nni_manager/core/nnimanager.ts @@ -16,7 +16,7 @@ import { NNIManagerStatus, ProfileUpdateType, TrialJobStatistics } from '../common/manager'; import { - TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus + TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus, LogType } from '../common/trainingService'; import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP, getTunerProc, getLogLevel, isAlive, killPid } from '../common/utils'; import { @@ -325,6 +325,10 @@ class NNIManager implements Manager { // FIXME: unit test } + public async getTrialLog(trialJobId: string, logType: LogType): Promise { + return this.trainingService.getTrialLog(trialJobId, logType); + } + public getExperimentProfile(): Promise { // TO DO: using Promise.resolve() const deferred: Deferred = new Deferred(); diff --git a/src/nni_manager/core/test/mockedTrainingService.ts 
b/src/nni_manager/core/test/mockedTrainingService.ts index 546a36e494..5dfec86427 100644 --- a/src/nni_manager/core/test/mockedTrainingService.ts +++ b/src/nni_manager/core/test/mockedTrainingService.ts @@ -7,7 +7,7 @@ import { Deferred } from 'ts-deferred'; import { Provider } from 'typescript-ioc'; import { MethodNotImplementedError } from '../../common/errors'; -import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; +import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; const testTrainingServiceProvider: Provider = { get: () => { return new MockedTrainingService(); } @@ -63,6 +63,10 @@ class MockedTrainingService extends TrainingService { return deferred.promise; } + public getTrialLog(trialJobId: string, logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + async run(): Promise { } diff --git a/src/nni_manager/rest_server/restHandler.ts b/src/nni_manager/rest_server/restHandler.ts index 457f154b69..af44d71a01 100644 --- a/src/nni_manager/rest_server/restHandler.ts +++ b/src/nni_manager/rest_server/restHandler.ts @@ -57,6 +57,7 @@ class NNIRestHandler { this.getMetricData(router); this.getMetricDataByRange(router); this.getLatestMetricData(router); + this.getTrialLog(router); this.exportData(router); // Express-joi-validator configuration @@ -268,6 +269,19 @@ class NNIRestHandler { }); } + private getTrialLog(router: Router): void { + router.get('/trial-log/:id/:type', async(req: Request, res: Response) => { + this.nniManager.getTrialLog(req.params.id, req.params.type).then((log: string) => { + if (log === '') { + log = 'No logs available.' 
+ } + res.send(log); + }).catch((err: Error) => { + this.handleError(err, res); + }); + }); + } + private exportData(router: Router): void { router.get('/export-data', (req: Request, res: Response) => { this.nniManager.exportData().then((exportedData: string) => { diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index a480501a79..cb1a1282e7 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -39,7 +39,6 @@ export namespace ValidationSchemas { nniManagerNFSMountPath: joi.string().min(1), containerNFSMountPath: joi.string().min(1), paiConfigPath: joi.string(), - computeTarget: joi.string(), nodeCount: joi.number(), paiStorageConfigName: joi.string().min(1), nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), @@ -103,7 +102,6 @@ export namespace ValidationSchemas { }), pai_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase userName: joi.string().min(1).required(), - passWord: joi.string().min(1), token: joi.string().min(1), host: joi.string().min(1).required(), reuse: joi.boolean(), @@ -160,7 +158,10 @@ export namespace ValidationSchemas { aml_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase subscriptionId: joi.string().min(1), resourceGroup: joi.string().min(1), - workspaceName: joi.string().min(1) + workspaceName: joi.string().min(1), + computeTarget: joi.string().min(1), + maxTrialNumPerGpu: joi.number(), + useActiveGpu: joi.boolean() }), nni_manager_ip: joi.object({ // eslint-disable-line @typescript-eslint/camelcase nniManagerIp: joi.string().min(1) diff --git a/src/nni_manager/rest_server/test/mockedNNIManager.ts b/src/nni_manager/rest_server/test/mockedNNIManager.ts index 5c8bc267b7..e45819d6cb 100644 --- a/src/nni_manager/rest_server/test/mockedNNIManager.ts +++ b/src/nni_manager/rest_server/test/mockedNNIManager.ts @@ -13,7 +13,7 @@ 
import { TrialJobStatistics, NNIManagerStatus } from '../../common/manager'; import { - TrialJobApplicationForm, TrialJobDetail, TrialJobStatus + TrialJobApplicationForm, TrialJobDetail, TrialJobStatus, LogType } from '../../common/trainingService'; export const testManagerProvider: Provider = { @@ -118,6 +118,9 @@ export class MockedNNIManager extends Manager { public getLatestMetricData(): Promise { throw new MethodNotImplementedError(); } + public getTrialLog(trialJobId: string, logType: LogType): Promise { + throw new MethodNotImplementedError(); + } public getExperimentProfile(): Promise { const profile: ExperimentProfile = { params: { diff --git a/src/nni_manager/training_service/dlts/dltsTrainingService.ts b/src/nni_manager/training_service/dlts/dltsTrainingService.ts index ba707fbb13..30d8fbcf8d 100644 --- a/src/nni_manager/training_service/dlts/dltsTrainingService.ts +++ b/src/nni_manager/training_service/dlts/dltsTrainingService.ts @@ -12,9 +12,10 @@ import { EventEmitter } from 'events'; import { String } from 'typescript-string-operations'; import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; +import { MethodNotImplementedError } from '../../common/errors'; import { NNIManagerIpConfig, TrainingService, - TrialJobApplicationForm, TrialJobDetail, TrialJobMetric + TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; import { DLTS_TRIAL_COMMAND_FORMAT } from './dltsData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; @@ -246,6 +247,10 @@ class DLTSTrainingService implements TrainingService { return trialJob } + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { this.metricsEmitter.on('metric', listener); } diff --git 
a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts index f21ac9ad69..11a54c453c 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts @@ -12,8 +12,9 @@ import { Base64 } from 'js-base64'; import { String } from 'typescript-string-operations'; import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; +import { MethodNotImplementedError } from '../../common/errors'; import { - NNIManagerIpConfig, TrialJobDetail, TrialJobMetric + NNIManagerIpConfig, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; import { delay, getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString } from '../../common/utils'; import { AzureStorageClientUtility } from './azureStorageClientUtils'; @@ -98,6 +99,10 @@ abstract class KubernetesTrainingService { return Promise.resolve(kubernetesTrialJob); } + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { this.metricsEmitter.on('metric', listener); } diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts index 71a1c5719c..a69bff8df8 100644 --- a/src/nni_manager/training_service/local/localTrainingService.ts +++ b/src/nni_manager/training_service/local/localTrainingService.ts @@ -14,7 +14,7 @@ import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; import { HyperParameters, TrainingService, TrialJobApplicationForm, - TrialJobDetail, TrialJobMetric, TrialJobStatus + TrialJobDetail, TrialJobMetric, TrialJobStatus, 
LogType } from '../../common/trainingService'; import { delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, getNewLine, isAlive, uniqueString @@ -184,6 +184,18 @@ class LocalTrainingService implements TrainingService { return trialJob; } + public async getTrialLog(trialJobId: string, logType: LogType): Promise { + let logPath: string; + if (logType === 'TRIAL_LOG') { + logPath = path.join(this.rootDir, 'trials', trialJobId, 'trial.log'); + } else if (logType === 'TRIAL_ERROR') { + logPath = path.join(this.rootDir, 'trials', trialJobId, 'stderr'); + } else { + throw new Error('unexpected log type'); + } + return fs.promises.readFile(logPath, 'utf8'); + } + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { this.eventEmitter.on('metric', listener); } @@ -450,8 +462,8 @@ class LocalTrainingService implements TrainingService { while (!this.stopping) { while (!this.stopping && this.jobQueue.length !== 0) { const trialJobId: string = this.jobQueue[0]; - const trialJobDeatil: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId); - if (trialJobDeatil !== undefined && trialJobDeatil.status === 'WAITING') { + const trialJobDetail: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId); + if (trialJobDetail !== undefined && trialJobDetail.status === 'WAITING') { const [success, resource] = this.tryGetAvailableResource(); if (!success) { break; diff --git a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts index 2590547849..5f6ccf4d9c 100644 --- a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts +++ b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts @@ -52,7 +52,7 @@ export class PAIJobInfoCollector { // Rest call to get PAI job info and update status // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API const getJobInfoRequest: request.Options = { - uri: 
`${protocol}://${paiClusterConfig.host}/rest-server/api/v1/user/${paiClusterConfig.userName}/jobs/${paiTrialJob.paiJobName}`, + uri: `${protocol}://${paiClusterConfig.host}/rest-server/api/v2/jobs/${paiClusterConfig.userName}~${paiTrialJob.paiJobName}`, method: 'GET', json: true, headers: { @@ -63,8 +63,9 @@ export class PAIJobInfoCollector { //TODO : pass in request timeout param? request(getJobInfoRequest, (error: Error, response: request.Response, _body: any) => { - if ((error !== undefined && error !== null) || response.statusCode >= 500) { - this.log.error(`PAI Training service: get job info for trial ${paiTrialJob.id} from PAI Cluster failed!`); + // Status code 200 for success + if ((error !== undefined && error !== null) || response.statusCode >= 400) { + // The job refresh time could be ealier than job submission, so it might return 404 error code, need refactor // Queried PAI job info failed, set job status to UNKNOWN if (paiTrialJob.status === 'WAITING' || paiTrialJob.status === 'RUNNING') { paiTrialJob.status = 'UNKNOWN'; diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index 59bd994535..6d6169a6b2 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -55,12 +55,7 @@ class PAIK8STrainingService extends PAITrainingService { this.paiJobRestServer = new PAIJobRestServer(component.get(PAIK8STrainingService)); this.paiClusterConfig = JSON.parse(value); this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host); - if (this.paiClusterConfig.passWord) { - // Get PAI authentication token - await this.updatePaiToken(); - } else if (this.paiClusterConfig.token) { - this.paiToken = this.paiClusterConfig.token; - } + this.paiToken = this.paiClusterConfig.token; break; case TrialConfigMetadataKey.TRIAL_CONFIG: { @@ -290,18 +285,20 @@ class 
PAIK8STrainingService extends PAITrainingService { uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs`, method: 'POST', body: paiJobConfig, + followAllRedirects: true, headers: { 'Content-Type': 'text/yaml', Authorization: `Bearer ${this.paiToken}` } }; request(submitJobRequest, (error: Error, response: request.Response, body: any) => { + // If submit success, will get status code 202. refer: https://github.com/microsoft/pai/blob/master/src/rest-server/docs/swagger.yaml if ((error !== undefined && error !== null) || response.statusCode >= 400) { const errorMessage: string = (error !== undefined && error !== null) ? error.message : `Submit trial ${trialJobId} failed, http code:${response.statusCode}, http body: ${body}`; - this.log.error(errorMessage); trialJobDetail.status = 'FAILED'; + deferred.reject(errorMessage); } else { trialJobDetail.submitTime = Date.now(); } diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index e26c16ecee..56756b708d 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -11,9 +11,10 @@ import { EventEmitter } from 'events'; import { Deferred } from 'ts-deferred'; import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; +import { MethodNotImplementedError } from '../../common/errors'; import { NNIManagerIpConfig, TrainingService, - TrialJobApplicationForm, TrialJobDetail, TrialJobMetric + TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; import { delay } from '../../common/utils'; import { PAIJobInfoCollector } from './paiJobInfoCollector'; @@ -117,6 +118,10 @@ abstract class PAITrainingService implements TrainingService { return jobs; } + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { + throw new 
MethodNotImplementedError(); + } + public async getTrialJob(trialJobId: string): Promise { if (this.paiClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); @@ -162,8 +167,7 @@ abstract class PAITrainingService implements TrainingService { } const stopJobRequest: request.Options = { - uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}\ -/jobs/${trialJobDetail.paiJobName}/executionType`, + uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs/${this.paiClusterConfig.userName}~${trialJobDetail.paiJobName}/executionType`, method: 'PUT', json: true, body: { value: 'STOP' }, @@ -178,6 +182,7 @@ abstract class PAITrainingService implements TrainingService { const deferred: Deferred = new Deferred(); request(stopJobRequest, (error: Error, response: request.Response, _body: any) => { + // Status code 202 for success. if ((error !== undefined && error !== null) || response.statusCode >= 400) { this.log.error(`PAI Training service: stop trial ${trialJobId} to PAI Cluster failed!`); deferred.reject((error !== undefined && error !== null) ? 
error.message : diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index c997b03a01..8736bc09b7 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -10,13 +10,13 @@ import * as path from 'path'; import { ShellExecutor } from 'training_service/remote_machine/shellExecutor'; import { Deferred } from 'ts-deferred'; import * as component from '../../common/component'; -import { NNIError, NNIErrorNames } from '../../common/errors'; +import { NNIError, NNIErrorNames, MethodNotImplementedError } from '../../common/errors'; import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; import { ObservableTimer } from '../../common/observableTimer'; import { HyperParameters, NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, - TrialJobDetail, TrialJobMetric + TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, getJobCancelStatus, @@ -57,6 +57,7 @@ class RemoteMachineTrainingService implements TrainingService { private nniManagerIpConfig?: NNIManagerIpConfig; private versionCheck: boolean = true; private logCollection: string; + private sshConnectionPromises: any[]; constructor(@component.Inject timer: ObservableTimer) { this.metricsEmitter = new EventEmitter(); @@ -65,6 +66,7 @@ class RemoteMachineTrainingService implements TrainingService { this.machineCopyExpCodeDirPromiseMap = new Map>(); this.machineExecutorManagerMap = new Map(); this.jobQueue = []; + this.sshConnectionPromises = []; this.expRootDir = getExperimentRootDir(); this.timer = timer; this.log = getLogger(); @@ -80,6 +82,12 @@ class RemoteMachineTrainingService implements TrainingService 
{ await restServer.start(); restServer.setEnableVersionCheck = this.versionCheck; this.log.info('Run remote machine training service.'); + if (this.sshConnectionPromises.length > 0) { + await Promise.all(this.sshConnectionPromises); + this.log.info('ssh connection initialized!'); + // set sshConnectionPromises to [] to avoid log information duplicated + this.sshConnectionPromises = []; + } while (!this.stopping) { while (this.jobQueue.length > 0) { this.updateGpuReservation(); @@ -172,6 +180,15 @@ class RemoteMachineTrainingService implements TrainingService { } } + /** + * Get trial job log + * @param _trialJobId ID of trial job + * @param _logType 'TRIAL_LOG' | 'TRIAL_STDERR' + */ + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + /** * Add job metrics listener * @param listener callback listener @@ -408,7 +425,6 @@ class RemoteMachineTrainingService implements TrainingService { //TO DO: verify if value's format is wrong, and json parse failed, how to handle error const rmMetaList: RemoteMachineMeta[] = JSON.parse(machineList); - const connectionPromises = []; for (const rmMeta of rmMetaList) { rmMeta.occupiedGpuIndexMap = new Map(); const executorManager: ExecutorManager = new ExecutorManager(rmMeta); @@ -417,11 +433,9 @@ class RemoteMachineTrainingService implements TrainingService { this.log.debug(`reached ${executor.name}`); this.machineExecutorManagerMap.set(rmMeta, executorManager); this.log.debug(`initializing ${executor.name}`); - connectionPromises.push(this.initRemoteMachineOnConnected(rmMeta, executor)); - this.log.info(`connected to ${executor.name}`); + this.sshConnectionPromises.push(this.initRemoteMachineOnConnected(rmMeta, executor)); + this.log.info(`connecting to ${executor.name}`); } - - await Promise.all(connectionPromises); } private async initRemoteMachineOnConnected(rmMeta: RemoteMachineMeta, executor: ShellExecutor): Promise { diff --git 
a/src/nni_manager/training_service/reusable/aml/amlConfig.ts b/src/nni_manager/training_service/reusable/aml/amlConfig.ts index dd8c2345d4..eceea3f6db 100644 --- a/src/nni_manager/training_service/reusable/aml/amlConfig.ts +++ b/src/nni_manager/training_service/reusable/aml/amlConfig.ts @@ -11,11 +11,18 @@ export class AMLClusterConfig { public readonly subscriptionId: string; public readonly resourceGroup: string; public readonly workspaceName: string; + public readonly computeTarget: string; + public useActiveGpu?: boolean; + public maxTrialNumPerGpu?: number; - constructor(subscriptionId: string, resourceGroup: string, workspaceName: string) { + constructor(subscriptionId: string, resourceGroup: string, workspaceName: string, computeTarget: string, + useActiveGpu?: boolean, maxTrialNumPerGpu?: number) { this.subscriptionId = subscriptionId; this.resourceGroup = resourceGroup; this.workspaceName = workspaceName; + this.computeTarget = computeTarget; + this.useActiveGpu = useActiveGpu; + this.maxTrialNumPerGpu = maxTrialNumPerGpu; } } @@ -23,14 +30,12 @@ export class AMLTrialConfig extends TrialConfig { public readonly image: string; public readonly command: string; public readonly codeDir: string; - public readonly computeTarget: string; - constructor(codeDir: string, command: string, image: string, computeTarget: string) { + constructor(codeDir: string, command: string, image: string) { super("", codeDir, 0); this.codeDir = codeDir; this.command = command; this.image = image; - this.computeTarget = computeTarget; } } diff --git a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index fea393e75d..7a26d0e21c 100644 --- a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -16,7 +16,7 @@ import { AMLClient } from '../aml/amlClient'; import 
{ AMLClusterConfig, AMLEnvironmentInformation, AMLTrialConfig } from '../aml/amlConfig'; import { AMLCommandChannel } from '../channels/amlCommandChannel'; import { CommandChannel } from "../commandChannel"; -import { EnvironmentInformation, EnvironmentService, EnvironmentStatus } from '../environment'; +import { EnvironmentInformation, EnvironmentService } from '../environment'; /** @@ -74,7 +74,7 @@ export class AMLEnvironmentService extends EnvironmentService { environments.forEach(async (environment) => { const amlClient = (environment as AMLEnvironmentInformation).amlClient; if (!amlClient) { - throw new Error('AML client not initialized!'); + return Promise.reject('AML client not initialized!'); } const newStatus = await amlClient.updateStatus(environment.status); switch (newStatus.toUpperCase()) { @@ -90,8 +90,8 @@ export class AMLEnvironmentService extends EnvironmentService { environment.setStatus('SUCCEEDED'); break; case 'FAILED': - environment.setStatus(newStatus.toUpperCase() as EnvironmentStatus); - break; + environment.setStatus('FAILED'); + return Promise.reject(`AML: job ${environment.envId} is failed!`); case 'STOPPED': case 'STOPPING': environment.setStatus('USER_CANCELED'); @@ -112,13 +112,15 @@ export class AMLEnvironmentService extends EnvironmentService { const amlEnvironment: AMLEnvironmentInformation = environment as AMLEnvironmentInformation; const environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp"); environment.command = `import os\nos.system('${amlEnvironment.command}')`; + environment.useActiveGpu = this.amlClusterConfig.useActiveGpu; + environment.maxTrialNumberPerGpu = this.amlClusterConfig.maxTrialNumPerGpu; await fs.promises.writeFile(path.join(environmentLocalTempFolder, 'nni_script.py'), amlEnvironment.command, { encoding: 'utf8' }); const amlClient = new AMLClient( this.amlClusterConfig.subscriptionId, this.amlClusterConfig.resourceGroup, this.amlClusterConfig.workspaceName, 
this.experimentId, - this.amlTrialConfig.computeTarget, + this.amlClusterConfig.computeTarget, this.amlTrialConfig.image, 'nni_script.py', environmentLocalTempFolder diff --git a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts index 3d92df5c99..596c81dbe9 100644 --- a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts @@ -28,15 +28,12 @@ export class OpenPaiEnvironmentService extends EnvironmentService { private paiTrialConfig: NNIPAIK8STrialConfig | undefined; private paiJobConfig: any; private paiToken?: string; - private paiTokenUpdateTime?: number; - private readonly paiTokenUpdateInterval: number; private protocol: string = 'http'; private experimentId: string; constructor() { super(); - this.paiTokenUpdateInterval = 7200000; //2hours this.experimentId = getExperimentId(); } @@ -53,12 +50,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService { case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG: this.paiClusterConfig = JSON.parse(value); this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host); - if (this.paiClusterConfig.passWord) { - // Get PAI authentication token - await this.updatePaiToken(); - } else if (this.paiClusterConfig.token) { - this.paiToken = this.paiClusterConfig.token; - } + this.paiToken = this.paiClusterConfig.token; break; case TrialConfigMetadataKey.TRIAL_CONFIG: { @@ -95,7 +87,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService { public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise { const deferred: Deferred = new Deferred(); - await this.refreshPlatform(); if (this.paiClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); @@ -115,9 +106,12 @@ export class 
OpenPaiEnvironmentService extends EnvironmentService { }; request(getJobInfoRequest, async (error: any, response: request.Response, body: any) => { + // Status code 200 for success if ((error !== undefined && error !== null) || response.statusCode >= 400) { - this.log.error(`OpenPAI: get environment list from PAI Cluster failed!\nerror: ${error}`); - deferred.reject(error); + const errorMessage: string = (error !== undefined && error !== null) ? error.message : + `OpenPAI: get environment list from PAI Cluster failed!, http code:${response.statusCode}, http body: ${JSON.stringify(body)}`; + this.log.error(`${errorMessage}`); + deferred.reject(errorMessage); } else { const jobInfos = new Map(); body.forEach((jobInfo: any) => { @@ -133,8 +127,11 @@ export class OpenPaiEnvironmentService extends EnvironmentService { case 'RUNNING': case 'WAITING': case 'SUCCEEDED': + environment.setStatus(jobResponse.state); + break; case 'FAILED': environment.setStatus(jobResponse.state); + deferred.reject(`OpenPAI: job ${environment.envId} is failed!`); break; case 'STOPPED': case 'STOPPING': @@ -166,8 +163,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService { public async startEnvironment(environment: EnvironmentInformation): Promise { const deferred: Deferred = new Deferred(); - await this.refreshPlatform(); - if (this.paiClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); } @@ -195,18 +190,21 @@ export class OpenPaiEnvironmentService extends EnvironmentService { uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs`, method: 'POST', body: paiJobConfig, + followAllRedirects: true, headers: { 'Content-Type': 'text/yaml', Authorization: `Bearer ${this.paiToken}` } }; request(submitJobRequest, (error, response, body) => { + // Status code 202 for success, refer https://github.com/microsoft/pai/blob/master/src/rest-server/docs/swagger.yaml if ((error !== undefined && error !== null) || response.statusCode 
>= 400) { const errorMessage: string = (error !== undefined && error !== null) ? error.message : `start environment ${environment.envId} failed, http code:${response.statusCode}, http body: ${body}`; this.log.error(errorMessage); environment.status = 'FAILED'; + deferred.reject(errorMessage); } deferred.resolve(); }); @@ -241,8 +239,11 @@ export class OpenPaiEnvironmentService extends EnvironmentService { try { request(stopJobRequest, (error, response, _body) => { try { + // Status code 202 for success. if ((error !== undefined && error !== null) || (response && response.statusCode >= 400)) { - this.log.error(`OpenPAI: stop job ${environment.envId} failed with ${response.statusCode}\n${error}`); + const errorMessage: string = (error !== undefined && error !== null) ? error.message : + `OpenPAI: stop job ${environment.envId} failed, http code:${response.statusCode}, http body: ${_body}`; + this.log.error(`${errorMessage}`); deferred.reject((error !== undefined && error !== null) ? error : `Stop trial failed, http code: ${response.statusCode}`); } else { @@ -262,19 +263,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService { return deferred.promise; } - private async refreshPlatform(): Promise { - if (this.paiClusterConfig && this.paiClusterConfig.passWord) { - try { - await this.updatePaiToken(); - } catch (error) { - this.log.error(`${error}`); - if (this.paiToken === undefined) { - throw new Error(error); - } - } - } - } - private generateJobConfigInYamlFormat(environment: EnvironmentInformation): any { if (this.paiTrialConfig === undefined) { throw new Error('trial config is not initialized'); @@ -386,59 +374,4 @@ export class OpenPaiEnvironmentService extends EnvironmentService { return host; } } - /** - * Update pai token by the interval time or initialize the pai token - */ - protected async updatePaiToken(): Promise { - const deferred: Deferred = new Deferred(); - - const currentTime: number = new Date().getTime(); - //If pai token 
initialized and not reach the interval time, do not update - if (this.paiTokenUpdateTime !== undefined && (currentTime - this.paiTokenUpdateTime) < this.paiTokenUpdateInterval) { - return Promise.resolve(); - } - - if (this.paiClusterConfig === undefined) { - const paiClusterConfigError: string = `pai cluster config not initialized!`; - this.log.error(`${paiClusterConfigError}`); - throw Error(`${paiClusterConfigError}`); - } - - const authenticationReq: request.Options = { - uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v1/token`, - method: 'POST', - json: true, - body: { - username: this.paiClusterConfig.userName, - password: this.paiClusterConfig.passWord - } - }; - - request(authenticationReq, (error: any, response: request.Response, body: any) => { - if (error !== undefined && error !== null) { - this.log.error(`Get PAI token failed: ${error.message}, authenticationReq: ${authenticationReq}`); - deferred.reject(new Error(`Get PAI token failed: ${error.message}`)); - } else { - if (response.statusCode !== 200) { - this.log.error(`Get PAI token failed: get PAI Rest return code ${response.statusCode}, authenticationReq: ${authenticationReq}`); - deferred.reject(new Error(`Get PAI token failed code: ${response.statusCode}, body: ${response.body}, authenticationReq: ${authenticationReq}, please check paiConfig username or password`)); - } else { - this.paiToken = body.token; - this.paiTokenUpdateTime = new Date().getTime(); - deferred.resolve(); - } - } - }); - - let timeoutId: NodeJS.Timer; - const timeoutDelay: Promise = new Promise((_resolve: Function, reject: Function): void => { - // Set timeout and reject the promise once reach timeout (5 seconds) - timeoutId = setTimeout( - () => reject(new Error('Get PAI token timeout. 
Please check your PAI cluster.')), - 5000); - }); - - return Promise.race([timeoutDelay, deferred.promise]) - .finally(() => { clearTimeout(timeoutId); }); - } } diff --git a/src/nni_manager/training_service/reusable/routerTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts index 1fd28604be..1e3b75cc86 100644 --- a/src/nni_manager/training_service/reusable/routerTrainingService.ts +++ b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -6,7 +6,8 @@ import { Container, Scope } from 'typescript-ioc'; import * as component from '../../common/component'; import { getLogger, Logger } from '../../common/log'; -import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; +import { MethodNotImplementedError } from '../../common/errors' +import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService'; import { delay } from '../../common/utils'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { PAIClusterConfig } from '../pai/paiConfig'; @@ -47,6 +48,10 @@ class RouterTrainingService implements TrainingService { return await this.internalTrainingService.getTrialJob(trialJobId); } + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { if (this.internalTrainingService === undefined) { throw new Error("TrainingService is not assigned!"); diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 1b310ef9e0..046f389ca2 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -9,10 +9,10 @@ import * as path from 'path'; import { Writable } from 
'stream'; import { String } from 'typescript-string-operations'; import * as component from '../../common/component'; -import { NNIError, NNIErrorNames } from '../../common/errors'; +import { NNIError, NNIErrorNames, MethodNotImplementedError } from '../../common/errors'; import { getBasePort, getExperimentId, getPlatform } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; -import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; +import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus, LogType } from '../../common/trainingService'; import { delay, getExperimentRootDir, getIPV4Address, getLogLevel, getVersion, mkDirPSync, uniqueString } from '../../common/utils'; import { GPU_INFO, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, REPORT_METRIC_DATA, SEND_TRIAL_JOB_PARAMETER, STDOUT, TRIAL_END, VERSION_CHECK } from '../../core/commands'; import { ScheduleResultType } from '../../training_service/common/gpuData'; @@ -111,6 +111,10 @@ class TrialDispatcher implements TrainingService { return trial; } + public async getTrialLog(_trialJobId: string, _logType: LogType): Promise { + throw new MethodNotImplementedError(); + } + public async submitTrialJob(form: TrialJobApplicationForm): Promise { if (this.trialConfig === undefined) { throw new Error(`trialConfig not initialized!`); diff --git a/src/nni_manager/training_service/test/localTrainingService.test.ts b/src/nni_manager/training_service/test/localTrainingService.test.ts index bc47e747ba..fbaaedcd41 100644 --- a/src/nni_manager/training_service/test/localTrainingService.test.ts +++ b/src/nni_manager/training_service/test/localTrainingService.test.ts @@ -3,14 +3,14 @@ 'use strict'; -import * as assert from 'assert'; import * as chai from 'chai'; import * as chaiAsPromised from 'chai-as-promised'; import * as fs from 'fs'; +import * as path from 
'path'; import * as tmp from 'tmp'; import * as component from '../../common/component'; -import { TrialJobApplicationForm, TrialJobDetail, TrainingService } from '../../common/trainingService'; -import { cleanupUnitTest, delay, prepareUnitTest } from '../../common/utils'; +import { TrialJobApplicationForm, TrialJobDetail} from '../../common/trainingService'; +import { cleanupUnitTest, delay, prepareUnitTest, getExperimentRootDir } from '../../common/utils'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { LocalTrainingService } from '../local/localTrainingService'; @@ -72,6 +72,36 @@ describe('Unit Test for LocalTrainingService', () => { chai.expect(jobDetail.status).to.be.equals('USER_CANCELED'); }).timeout(20000); + it('Get trial log', async () => { + await localTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, trialConfig); + + // submit job + const form: TrialJobApplicationForm = { + sequenceId: 0, + hyperParameters: { + value: 'mock hyperparameters', + index: 0 + } + }; + + const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form); + + // get trial log + const rootDir: string = getExperimentRootDir() + fs.mkdirSync(path.join(rootDir, 'trials')) + fs.mkdirSync(jobDetail.workingDirectory) + fs.writeFileSync(path.join(jobDetail.workingDirectory, 'trial.log'), 'trial log') + fs.writeFileSync(path.join(jobDetail.workingDirectory, 'stderr'), 'trial stderr') + chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG')).to.be.equals('trial log'); + chai.expect(await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_ERROR')).to.be.equals('trial stderr'); + fs.unlinkSync(path.join(jobDetail.workingDirectory, 'trial.log')) + fs.unlinkSync(path.join(jobDetail.workingDirectory, 'stderr')) + fs.rmdirSync(jobDetail.workingDirectory) + fs.rmdirSync(path.join(rootDir, 'trials')) + + await localTrainingService.cancelTrialJob(jobDetail.id); + }).timeout(20000); + it('Read 
metrics, Add listener, and remove listener', async () => { // set meta data const trialConfig: string = `{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"${localCodeDir}\",\"gpuNum\":0}` diff --git a/src/sdk/pycli/nnicli/nni_client.py b/src/sdk/pycli/nnicli/nni_client.py index ca083c2c7f..571e2bd036 100644 --- a/src/sdk/pycli/nnicli/nni_client.py +++ b/src/sdk/pycli/nnicli/nni_client.py @@ -5,67 +5,47 @@ Example: -import nnicli as nc +from nnicli import Experiment -nc.start_nni('../../../../examples/trials/mnist/config.yml') +exp = Experiment() +exp.start_experiment('../../../../examples/trials/mnist-pytorch/config.yml') -nc.set_endpoint('http://localhost:8080') +exp.update_concurrency(3) -print(nc.version()) -print(nc.get_experiment_status()) +print(exp.get_experiment_status()) +print(exp.get_job_statistics()) +print(exp.list_trial_jobs()) -print(nc.get_job_statistics()) -print(nc.list_trial_jobs()) - -nc.stop_nni() +exp.stop_experiment() """ import sys import os import subprocess +import re +import json import requests __all__ = [ - 'start_nni', - 'stop_nni', - 'set_endpoint', - 'version', - 'get_experiment_status', - 'get_experiment_profile', - 'get_trial_job', - 'list_trial_jobs', - 'get_job_statistics', - 'get_job_metrics', - 'export_data' + 'Experiment', + 'TrialResult', + 'TrialMetricData', + 'TrialHyperParameters', + 'TrialJob' ] EXPERIMENT_PATH = 'experiment' -VERSION_PATH = 'version' STATUS_PATH = 'check-status' JOB_STATISTICS_PATH = 'job-statistics' TRIAL_JOBS_PATH = 'trial-jobs' METRICS_PATH = 'metric-data' EXPORT_DATA_PATH = 'export-data' - API_ROOT_PATH = 'api/v1/nni' -_api_endpoint = None - -def set_endpoint(endpoint): - """set endpoint of nni rest server for nnicli, for example: - http://localhost:8080 - """ - global _api_endpoint - _api_endpoint = endpoint - -def _check_endpoint(): - if _api_endpoint is None: - raise AssertionError("Please call set_endpoint to specify nni endpoint") - -def _nni_rest_get(api_path, response_type='json'): - 
_check_endpoint() - uri = '{}/{}/{}'.format(_api_endpoint, API_ROOT_PATH, api_path) +def _nni_rest_get(endpoint, api_path, response_type='json'): + _check_endpoint(endpoint) + uri = '{}/{}/{}'.format(endpoint.strip('/'), API_ROOT_PATH, api_path) res = requests.get(uri) if _http_succeed(res.status_code): if response_type == 'json': @@ -73,7 +53,7 @@ def _nni_rest_get(api_path, response_type='json'): elif response_type == 'text': return res.text else: - raise AssertionError('Incorrect response_type') + raise RuntimeError('Incorrect response_type') else: return None @@ -92,48 +72,444 @@ def _create_process(cmd): print(output.decode('utf-8').strip()) return process.returncode -def start_nni(config_file): - """start nni experiment with specified configuration file""" - cmd = 'nnictl create --config {}'.format(config_file).split(' ') - if _create_process(cmd) != 0: - raise RuntimeError('Failed to start nni.') - -def stop_nni(): - """stop nni experiment""" - cmd = 'nnictl stop'.split(' ') - if _create_process(cmd) != 0: - raise RuntimeError('Failed to stop nni.') - -def version(): - """return version of nni""" - return _nni_rest_get(VERSION_PATH, 'text') - -def get_experiment_status(): - """return experiment status as a dict""" - return _nni_rest_get(STATUS_PATH) - -def get_experiment_profile(): - """return experiment profile as a dict""" - return _nni_rest_get(EXPERIMENT_PATH) - -def get_trial_job(trial_job_id): - """return trial job information as a dict""" - assert trial_job_id is not None - return _nni_rest_get(os.path.join(TRIAL_JOBS_PATH, trial_job_id)) - -def list_trial_jobs(): - """return information for all trial jobs as a list""" - return _nni_rest_get(TRIAL_JOBS_PATH) - -def get_job_statistics(): - """return trial job statistics information as a dict""" - return _nni_rest_get(JOB_STATISTICS_PATH) - -def get_job_metrics(trial_job_id=None): - """return trial job metrics""" - api_path = METRICS_PATH if trial_job_id is None else os.path.join(METRICS_PATH, 
trial_job_id) - return _nni_rest_get(api_path) - -def export_data(): - """return exported information for all trial jobs""" - return _nni_rest_get(EXPORT_DATA_PATH) +def _check_endpoint(endpoint): + if endpoint is None: + raise RuntimeError("This instance hasn't been connect to an experiment.") + +class TrialResult: + """ + TrialResult stores the result information of a trial job. + + Parameters + ---------- + json_obj: dict + Json object that stores the result information. + + Attributes + ---------- + parameter: dict + Hyper parameters for this trial. + value: serializable object, usually a number, or a dict with key "default" and other extra keys + Final result. + trialJobId: str + Trial job id. + """ + def __init__(self, json_obj): + self.parameter = None + self.value = None + self.trialJobId = None + for key in json_obj.keys(): + if key == 'id': + setattr(self, 'trialJobId', json_obj[key]) + elif hasattr(self, key): + setattr(self, key, json_obj[key]) + self.value = json.loads(self.value) + + def __repr__(self): + return "TrialResult(parameter: {} value: {} trialJobId: {})".format(self.parameter, self.value, self.trialJobId) + +class TrialMetricData: + """ + TrialMetricData stores the metric data of a trial job. + A trial job may have both intermediate metric and final metric. + + Parameters + ---------- + json_obj: dict + Json object that stores the metric data. + + Attributes + ---------- + timestamp: int + Time stamp. + trialJobId: str + Trial job id. + parameterId: int + Parameter id. + type: str + Metric type, `PERIODICAL` for intermediate result and `FINAL` for final result. + sequence: int + Sequence number in this trial. + data: serializable object, usually a number, or a dict with key "default" and other extra keys + Metric data. 
+ """ + def __init__(self, json_obj): + self.timestamp = None + self.trialJobId = None + self.parameterId = None + self.type = None + self.sequence = None + self.data = None + for key in json_obj.keys(): + setattr(self, key, json_obj[key]) + self.data = json.loads(json.loads(self.data)) + + def __repr__(self): + return "TrialMetricData(timestamp: {} trialJobId: {} parameterId: {} type: {} sequence: {} data: {})" \ + .format(self.timestamp, self.trialJobId, self.parameterId, self.type, self.sequence, self.data) + +class TrialHyperParameters: + """ + TrialHyperParameters stores the hyper parameters of a trial job. + + Parameters + ---------- + json_obj: dict + Json object that stores the hyper parameters. + + Attributes + ---------- + parameter_id: int + Parameter id. + parameter_source: str + Parameter source. + parameters: dict + Hyper parameters. + parameter_index: int + Parameter index. + """ + def __init__(self, json_obj): + self.parameter_id = None + self.parameter_source = None + self.parameters = None + self.parameter_index = None + for key in json_obj.keys(): + if hasattr(self, key): + setattr(self, key, json_obj[key]) + + def __repr__(self): + return "TrialHyperParameters(parameter_id: {} parameter_source: {} parameters: {} parameter_index: {})" \ + .format(self.parameter_id, self.parameter_source, self.parameters, self.parameter_index) + +class TrialJob: + """ + TrialJob stores the information of a trial job. + + Parameters + ---------- + json_obj: dict + json object that stores the hyper parameters + + Attributes + ---------- + trialJobId: str + Trial job id. + status: str + Job status. + hyperParameters: list of `nnicli.TrialHyperParameters` + See `nnicli.TrialHyperParameters`. + logPath: str + Log path. + startTime: int + Job start time (timestamp). + endTime: int + Job end time (timestamp). + finalMetricData: list of `nnicli.TrialMetricData` + See `nnicli.TrialMetricData`. + parameter_index: int + Parameter index. 
+ """ + def __init__(self, json_obj): + self.trialJobId = None + self.status = None + self.hyperParameters = None + self.logPath = None + self.startTime = None + self.endTime = None + self.finalMetricData = None + self.stderrPath = None + for key in json_obj.keys(): + if key == 'id': + setattr(self, 'trialJobId', json_obj[key]) + elif hasattr(self, key): + setattr(self, key, json_obj[key]) + if self.hyperParameters: + self.hyperParameters = [TrialHyperParameters(json.loads(e)) for e in self.hyperParameters] + if self.finalMetricData: + self.finalMetricData = [TrialMetricData(e) for e in self.finalMetricData] + + def __repr__(self): + return ("TrialJob(trialJobId: {} status: {} hyperParameters: {} logPath: {} startTime: {} " + "endTime: {} finalMetricData: {} stderrPath: {})") \ + .format(self.trialJobId, self.status, self.hyperParameters, self.logPath, + self.startTime, self.endTime, self.finalMetricData, self.stderrPath) + +class Experiment: + def __init__(self): + self._endpoint = None + self._exp_id = None + self._port = None + + @property + def endpoint(self): + return self._endpoint + + @property + def exp_id(self): + return self._exp_id + + @property + def port(self): + return self._port + + def _exec_command(self, cmd, port=None): + if self._endpoint is not None: + raise RuntimeError('This instance has been connected to an experiment.') + if _create_process(cmd) != 0: + raise RuntimeError('Failed to establish experiment, please check your config.') + else: + if port: + self._port = port + else: + self._port = 8080 + self._endpoint = 'http://localhost:{}'.format(self._port) + self._exp_id = self.get_experiment_profile()['id'] + + def start_experiment(self, config_file, port=None, debug=False): + """ + Start an experiment with specified configuration file and connect to it. + + Parameters + ---------- + config_file: str + Path to the config file. + port: int + The port of restful server, bigger than 1024. + debug: boolean + Set debug mode. 
+ """ + cmd = 'nnictl create --config {}'.format(config_file).split(' ') + if port: + cmd += '--port {}'.format(port).split(' ') + if debug: + cmd += ['--debug'] + self._exec_command(cmd, port) + + def resume_experiment(self, exp_id, port=None, debug=False): + """ + Resume a stopped experiment with specified experiment id + + Parameters + ---------- + exp_id: str + Experiment id. + port: int + The port of restful server, bigger than 1024. + debug: boolean + Set debug mode. + """ + cmd = 'nnictl resume {}'.format(exp_id).split(' ') + if port: + cmd += '--port {}'.format(port).split(' ') + if debug: + cmd += ['--debug'] + self._exec_command(cmd, port) + + def view_experiment(self, exp_id, port=None): + """ + View a stopped experiment with specified experiment id. + + Parameters + ---------- + exp_id: str + Experiment id. + port: int + The port of restful server, bigger than 1024. + """ + cmd = 'nnictl view {}'.format(exp_id).split(' ') + if port: + cmd += '--port {}'.format(port).split(' ') + self._exec_command(cmd, port) + + def connect_experiment(self, endpoint): + """ + Connect to an existing experiment. + + Parameters + ---------- + endpoint: str + The endpoint of nni rest server, i.e, the url of Web UI. Should be a format like `http://ip:port`. + """ + if self._endpoint is not None: + raise RuntimeError('This instance has been connected to an experiment.') + self._endpoint = endpoint + try: + self._exp_id = self.get_experiment_profile()['id'] + except TypeError: + raise RuntimeError('Invalid experiment endpoint.') + self._port = int(re.search(r':[0-9]+', self._endpoint).group().replace(':', '')) + + def stop_experiment(self): + """Stop the experiment. 
+ """ + _check_endpoint(self._endpoint) + cmd = 'nnictl stop {}'.format(self._exp_id).split(' ') + if _create_process(cmd) != 0: + raise RuntimeError('Failed to stop experiment.') + self._endpoint = None + self._exp_id = None + self._port = None + + def update_searchspace(self, filename): + """ + Update the experiment's search space. + + Parameters + ---------- + filename: str + Path to the searchspace file. + """ + _check_endpoint(self._endpoint) + cmd = 'nnictl update searchspace {} --filename {}'.format(self._exp_id, filename).split(' ') + if _create_process(cmd) != 0: + raise RuntimeError('Failed to update searchspace.') + + def update_concurrency(self, value): + """ + Update an experiment's concurrency + + Parameters + ---------- + value: int + New concurrency value. + """ + _check_endpoint(self._endpoint) + cmd = 'nnictl update concurrency {} --value {}'.format(self._exp_id, value).split(' ') + if _create_process(cmd) != 0: + raise RuntimeError('Failed to update concurrency.') + + def update_duration(self, value): + """ + Update an experiment's duration + + Parameters + ---------- + value: str + Strings like '1m' for one minute or '2h' for two hours. + SUFFIX may be 's' for seconds, 'm' for minutes, 'h' for hours or 'd' for days. + """ + _check_endpoint(self._endpoint) + cmd = 'nnictl update duration {} --value {}'.format(self._exp_id, value).split(' ') + if _create_process(cmd) != 0: + raise RuntimeError('Failed to update duration.') + + def update_trailnum(self, value): + """ + Update an experiment's maxtrialnum + + Parameters + ---------- + value: int + New trailnum value. + """ + _check_endpoint(self._endpoint) + cmd = 'nnictl update trialnum {} --value {}'.format(self._exp_id, value).split(' ') + if _create_process(cmd) != 0: + raise RuntimeError('Failed to update trailnum.') + + def get_experiment_status(self): + """ + Return experiment status as a dict. + + Returns + ---------- + dict + Experiment status. 
+ """ + _check_endpoint(self._endpoint) + return _nni_rest_get(self._endpoint, STATUS_PATH) + + def get_trial_job(self, trial_job_id): + """ + Return a trial job. + + Parameters + ---------- + trial_job_id: str + Trial job id. + + Returns + ---------- + nnicli.TrialJob + A `nnicli.TrialJob` instance corresponding to `trial_job_id`. + """ + _check_endpoint(self._endpoint) + assert trial_job_id is not None + trial_job = _nni_rest_get(self._endpoint, os.path.join(TRIAL_JOBS_PATH, trial_job_id)) + return TrialJob(trial_job) + + def list_trial_jobs(self): + """ + Return information for all trial jobs as a list. + + Returns + ---------- + list + List of `nnicli.TrialJob`. + """ + _check_endpoint(self._endpoint) + trial_jobs = _nni_rest_get(self._endpoint, TRIAL_JOBS_PATH) + return [TrialJob(e) for e in trial_jobs] + + def get_job_statistics(self): + """ + Return trial job statistics information as a dict. + + Returns + ---------- + list + Job statistics information. + """ + _check_endpoint(self._endpoint) + return _nni_rest_get(self._endpoint, JOB_STATISTICS_PATH) + + def get_job_metrics(self, trial_job_id=None): + """ + Return trial job metrics. + + Parameters + ---------- + trial_job_id: str + trial job id. if this parameter is None, all trail jobs' metrics will be returned. + + Returns + ---------- + dict + Each key is a trialJobId, the corresponding value is a list of `nnicli.TrialMetricData`. + """ + _check_endpoint(self._endpoint) + api_path = METRICS_PATH if trial_job_id is None else os.path.join(METRICS_PATH, trial_job_id) + output = {} + trail_metrics = _nni_rest_get(self._endpoint, api_path) + for metric in trail_metrics: + trial_id = metric["trialJobId"] + if trial_id not in output: + output[trial_id] = [TrialMetricData(metric)] + else: + output[trial_id].append(TrialMetricData(metric)) + return output + + def export_data(self): + """ + Return exported information for all trial jobs. + + Returns + ---------- + list + List of `nnicli.TrialResult`. 
+ """ + _check_endpoint(self._endpoint) + trial_results = _nni_rest_get(self._endpoint, EXPORT_DATA_PATH) + return [TrialResult(e) for e in trial_results] + + def get_experiment_profile(self): + """ + Return experiment profile as a dict. + + Returns + ---------- + dict + The profile of the experiment. + """ + _check_endpoint(self._endpoint) + return _nni_rest_get(self._endpoint, EXPERIMENT_PATH) diff --git a/src/sdk/pynni/nni/_graph_utils.py b/src/sdk/pynni/nni/_graph_utils.py index 25e513f42e..3fa6cd0eab 100644 --- a/src/sdk/pynni/nni/_graph_utils.py +++ b/src/sdk/pynni/nni/_graph_utils.py @@ -530,8 +530,15 @@ def _is_key_func(self, node_cpp): return True if node_cpp.kind() in [LIST_UNPACK_KIND, TUPLE_UNPACK_KIND]: # We cannot merge the List/Tuple - # Construct/Unpack func into other nodes, else it + # Unpack func into other nodes, else it # may lead to a graph construction error. + # The reason why we donnot take the construct node + # also as a key node is that `cat` operation node need + # the last(previous) visited node to infer the mask. If + # we take the Construct node as the important node, the + # predecessor of the `cat` node will always be a construct + # node, which means we cannot infer the mask for the cat + # operation. 
return True return False @@ -556,9 +563,13 @@ def unpack_manually(self): _logger.debug('List/Tuple Construct Node(cpp) %s', str(last_cpp)) _logger.debug('List/Tuple Unpack Node(cpp) %s', str(unpack_cpp)) assert len(list(unpack_cpp.outputs())) == len(list(last_cpp.inputs())) - for _input, _output in zip(last_cpp.inputs(), unpack_cpp.outputs()): - _debug_input = _input.debugName() - _debug_output = _output.debugName() + errmsg = '%s Input number: %d if inconsistent with the output number %d' % (unpack_cpp, \ + len(node.inputs), len(list(last_cpp.inputs()))) + + assert len(node.inputs) == len(list(last_cpp.inputs())), errmsg + for _debug_input, _debug_output in zip(node.inputs, node.outputs): + # _debug_input = _input.debugName() + # _debug_output = _output.debugName() if _debug_input in self.input_to_node and _debug_output in self.input_to_node: # input_to_node[_debug_input] is a list of NodePyGroup, because # one tensor can be used as input for multiple nodes at the same time. @@ -570,10 +581,13 @@ def unpack_manually(self): self.input_to_node[_debug_input].remove(node) # add the following nodes of _output into the input_to_node[_debug_input] self.input_to_node[_debug_input].extend(self.input_to_node[_debug_output]) - if _debug_input in self.output_to_node and _debug_output in self.output_to_node: - # output_to_node[_debug_output] is a NodePyGroup, because one output - # tensor only can be generated by one node. - self.output_to_node[_debug_output] = self.output_to_node[_debug_input] + # just remove the _debug_output from the grapgh index. 
So that we can also skip + # the construct and tuple + if _debug_output in self.input_to_node: + for following_node in self.input_to_node[_debug_output]: + _tmp_index = following_node.inputs.index(_debug_output) + following_node.inputs[_tmp_index] = _debug_input + self.unpacked = True diff --git a/src/sdk/pynni/nni/compression/tensorflow/__init__.py b/src/sdk/pynni/nni/compression/tensorflow/__init__.py index 45b6c4e7b8..00d41ee55b 100644 --- a/src/sdk/pynni/nni/compression/tensorflow/__init__.py +++ b/src/sdk/pynni/nni/compression/tensorflow/__init__.py @@ -1,6 +1,5 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -from .compressor import LayerInfo, Compressor, Pruner, Quantizer -from .builtin_pruners import * -from .builtin_quantizers import * +from .compressor import Compressor, Pruner +from .pruning import * diff --git a/src/sdk/pynni/nni/compression/tensorflow/builtin_pruners.py b/src/sdk/pynni/nni/compression/tensorflow/builtin_pruners.py deleted file mode 100644 index 89ea1a722d..0000000000 --- a/src/sdk/pynni/nni/compression/tensorflow/builtin_pruners.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging -import numpy as np -import tensorflow as tf -from .compressor import Pruner - -__all__ = ['LevelPruner', 'AGPPruner', 'FPGMPruner'] - -_logger = logging.getLogger(__name__) - - -class LevelPruner(Pruner): - """ - Parameters - ---------- - model : tensorflow model - Model to be pruned - config_list : list - Supported keys: - - sparsity : This is to specify the sparsity operations to be compressed to. - - op_types : Operation types to prune. 
- """ - def __init__(self, model, config_list): - super().__init__(model, config_list) - self.mask_list = {} - self.if_init_list = {} - - def calc_mask(self, layer, config): - weight = layer.weight - op_name = layer.name - if self.if_init_list.get(op_name, True): - threshold = tf.contrib.distributions.percentile(tf.abs(weight), config['sparsity'] * 100) - mask = tf.cast(tf.math.greater(tf.abs(weight), threshold), weight.dtype) - self.mask_list.update({op_name: mask}) - self.if_init_list.update({op_name: False}) - else: - mask = self.mask_list[op_name] - return mask - - -class AGPPruner(Pruner): - """ - Parameters - ---------- - model : torch.nn.Module - Model to be pruned. - config_list : listlist - Supported keys: - - initial_sparsity: This is to specify the sparsity when compressor starts to compress. - - final_sparsity: This is to specify the sparsity when compressor finishes to compress. - - start_epoch: This is to specify the epoch number when compressor starts to compress, default start from epoch 0. - - end_epoch: This is to specify the epoch number when compressor finishes to compress. - - frequency: This is to specify every *frequency* number epochs compressor compress once, default frequency=1. 
- """ - - def __init__(self, model, config_list): - super().__init__(model, config_list) - self.mask_list = {} - self.if_init_list = {} - self.now_epoch = tf.Variable(0) - self.assign_handler = [] - - def calc_mask(self, layer, config): - weight = layer.weight - op_name = layer.name - start_epoch = config.get('start_epoch', 0) - freq = config.get('frequency', 1) - if self.now_epoch >= start_epoch and self.if_init_list.get(op_name, True) and ( - self.now_epoch - start_epoch) % freq == 0: - target_sparsity = self.compute_target_sparsity(config) - threshold = tf.contrib.distributions.percentile(weight, target_sparsity * 100) - # stop gradient in case gradient change the mask - mask = tf.stop_gradient(tf.cast(tf.math.greater(weight, threshold), weight.dtype)) - self.assign_handler.append(tf.assign(weight, weight * mask)) - self.mask_list.update({op_name: tf.constant(mask)}) - self.if_init_list.update({op_name: False}) - else: - mask = self.mask_list[op_name] - return mask - - def compute_target_sparsity(self, config): - end_epoch = config.get('end_epoch', 1) - start_epoch = config.get('start_epoch', 0) - freq = config.get('frequency', 1) - final_sparsity = config.get('final_sparsity', 0) - initial_sparsity = config.get('initial_sparsity', 0) - - if end_epoch <= start_epoch or initial_sparsity >= final_sparsity: - _logger.warning('your end epoch <= start epoch or initial_sparsity >= final_sparsity') - return final_sparsity - - now_epoch = tf.minimum(self.now_epoch, tf.constant(end_epoch)) - span = int(((end_epoch - start_epoch - 1) // freq) * freq) - assert span > 0 - base = tf.cast(now_epoch - start_epoch, tf.float32) / span - target_sparsity = (final_sparsity + - (initial_sparsity - final_sparsity) * - (tf.pow(1.0 - base, 3))) - return target_sparsity - - def update_epoch(self, epoch, sess): - sess.run(self.assign_handler) - sess.run(tf.assign(self.now_epoch, int(epoch))) - for k in self.if_init_list: - self.if_init_list[k] = True - - -class FPGMPruner(Pruner): - """ 
- Parameters - ---------- - model : tensorflow model - Model to be pruned - config_list : list - Supported keys: - - sparsity : percentage of convolutional filters to be pruned. - - op_types : Only Conv2d is supported in FPGM Pruner. - """ - def __init__(self, model, config_list): - super().__init__(model, config_list) - self.mask_dict = {} - self.assign_handler = [] - self.epoch_pruned_layers = set() - - def calc_mask(self, layer, config): - """ - Supports Conv1D, Conv2D - filter dimensions for Conv1D: - LEN: filter length - IN: number of input channel - OUT: number of output channel - - filter dimensions for Conv2D: - H: filter height - W: filter width - IN: number of input channel - OUT: number of output channel - - Parameters - ---------- - layer : LayerInfo - calculate mask for `layer`'s weight - config : dict - the configuration for generating the mask - """ - - weight = layer.weight - op_type = layer.type - op_name = layer.name - assert 0 <= config.get('sparsity') < 1 - assert op_type in ['Conv1D', 'Conv2D'] - assert op_type in config['op_types'] - - if layer.name in self.epoch_pruned_layers: - assert layer.name in self.mask_dict - return self.mask_dict.get(layer.name) - - try: - w = tf.stop_gradient(tf.transpose(tf.reshape(weight, (-1, weight.shape[-1])), [1, 0])) - masks = np.ones(w.shape) - num_filters = w.shape[0] - num_prune = int(num_filters * config.get('sparsity')) - if num_filters < 2 or num_prune < 1: - return masks - min_gm_idx = self._get_min_gm_kernel_idx(w, num_prune) - - for idx in min_gm_idx: - masks[idx] = 0. 
- finally: - masks = tf.reshape(tf.transpose(masks, [1, 0]), weight.shape) - masks = tf.Variable(masks) - self.mask_dict.update({op_name: masks}) - self.epoch_pruned_layers.add(layer.name) - - return masks - - def _get_min_gm_kernel_idx(self, weight, n): - dist_list = [] - for out_i in range(weight.shape[0]): - dist_sum = self._get_distance_sum(weight, out_i) - dist_list.append((dist_sum, out_i)) - min_gm_kernels = sorted(dist_list, key=lambda x: x[0])[:n] - return [x[1] for x in min_gm_kernels] - - def _get_distance_sum(self, weight, out_idx): - anchor_w = tf.tile(tf.expand_dims(weight[out_idx], 0), [weight.shape[0], 1]) - x = weight - anchor_w - x = tf.math.reduce_sum((x*x), -1) - x = tf.math.sqrt(x) - return tf.math.reduce_sum(x) - - def update_epoch(self, epoch): - self.epoch_pruned_layers = set() diff --git a/src/sdk/pynni/nni/compression/tensorflow/builtin_quantizers.py b/src/sdk/pynni/nni/compression/tensorflow/builtin_quantizers.py deleted file mode 100644 index 3f54cbfb12..0000000000 --- a/src/sdk/pynni/nni/compression/tensorflow/builtin_quantizers.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -import logging -import tensorflow as tf -from .compressor import Quantizer - -__all__ = ['NaiveQuantizer', 'QAT_Quantizer', 'DoReFaQuantizer'] - -_logger = logging.getLogger(__name__) - - -class NaiveQuantizer(Quantizer): - """quantize weight to 8 bits - """ - def __init__(self, model, config_list): - super().__init__(model, config_list) - self.layer_scale = {} - - def quantize_weight(self, weight, config, op_name, **kwargs): - new_scale = tf.reduce_max(tf.abs(weight)) / 127 - scale = tf.maximum(self.layer_scale.get(op_name, tf.constant(0.0)), new_scale) - self.layer_scale[op_name] = scale - orig_type = weight.dtype - return tf.cast(tf.cast(weight / scale, tf.int8), orig_type) * scale - - -class QAT_Quantizer(Quantizer): - """Quantizer using the Quantization and Training scheme, as defined in: - Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference - http://openaccess.thecvf.com/content_cvpr_2018/papers/Jacob_Quantization_and_Training_CVPR_2018_paper.pdf - """ - def __init__(self, model, config_list): - """ - config_list: supported keys: - - q_bits - """ - super().__init__(model, config_list) - - def quantize_weight(self, weight, config, **kwargs): - a = tf.stop_gradient(tf.reduce_min(weight)) - b = tf.stop_gradient(tf.reduce_max(weight)) - n = tf.cast(2 ** config['q_bits'], tf.float32) - scale = b-a/(n-1) - - # use gradient_override_map to change round to idetity for gradient - with tf.get_default_graph().gradient_override_map({'Round': 'Identity'}): - qw = tf.round((weight-a)/scale)*scale +a - - return qw - - -class DoReFaQuantizer(Quantizer): - """Quantizer using the DoReFa scheme, as defined in: - Zhou et al., DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients - (https://arxiv.org/abs/1606.06160) - """ - def __init__(self, model, config_list): - """ - config_list: supported keys: - - q_bits - """ - super().__init__(model, config_list) - - def quantize_weight(self, weight, config, 
**kwargs): - a = tf.math.tanh(weight) - b = a/(2*tf.reduce_max(tf.abs(weight))) + 0.5 - - scale = pow(2, config['q_bits'] - 1) - # use gradient_override_map to change round to idetity for gradient - with tf.get_default_graph().gradient_override_map({'Round': 'Identity'}): - qw = tf.round(b*scale)/scale - r_qw = 2 * qw - 1 - return r_qw diff --git a/src/sdk/pynni/nni/compression/tensorflow/compressor.py b/src/sdk/pynni/nni/compression/tensorflow/compressor.py index 62580738a3..bbe4a21a52 100644 --- a/src/sdk/pynni/nni/compression/tensorflow/compressor.py +++ b/src/sdk/pynni/nni/compression/tensorflow/compressor.py @@ -1,204 +1,300 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +""" +Abstract base classes for TensorFlow model compression. +""" + import logging import tensorflow as tf from . import default_layers -tf.config.experimental_run_functions_eagerly(True) _logger = logging.getLogger(__name__) class LayerInfo: - def __init__(self, keras_layer): - self.keras_layer = keras_layer - self.name = keras_layer.name - self.type = default_layers.get_op_type(type(keras_layer)) - self.weight_index = default_layers.get_weight_index(self.type) - if self.weight_index is not None: - self.weight = keras_layer.weights[self.weight_index] - self._call = None + """ + This structure contains all infomation needed to compress a TensorFlow ``Layer``. + + + Attributes + ---------- + layer : tf.keras.layers.Layer + The layer. + name : str + The layer's name. Note that it's local to sub-model and may differ from its attribute name. + type : str + Name of the layer's class. + path : list of str/int + The layer object's and its parents' attribute name / list index. + For example, if the path is `['cells', 2, 'conv']`, then the layer can be accessed as `model.cells[2].conv`. + config : JSON object + Selected configuration for this layer. The format is detailed in tutorial. + + Parameters + ---------- + layer : tf.keras.layers.Layer + See attributes section. 
+ path : list of str/int + See attributes section. + """ + + def __init__(self, layer, path=None): + self.layer = layer + self.name = layer.name + self.type = type(layer).__name__ + self.path = path + self.config = None + class Compressor: """ - Abstract base TensorFlow compressor + Common base class for all compressors. + + This class is designed for other base classes. + Algorithms should inherit ``Pruner`` or ``Quantizer`` instead. + + + Attributes + ---------- + bound_model : tf.keras.Model + Compressed user model. + wrappers : list of tf.keras.Model + A wrapper is an instrumented TF ``Layer``, in ``Model`` format. + The list is ordered by preorder traversal. + + Parameters + ---------- + LayerWrapperClass : a class derive from Model + The class used to instrument layers. + model : tf.keras.Model + The user model to be compressed. + config_list : list of JSON object + User configuration. The format is detailed in tutorial. """ - def __init__(self, model, config_list): - """ - Record necessary info in class members + def __init__(self, LayerWrapperClass, model, config_list): + assert isinstance(model, tf.keras.Model) + self.validate_config(model, config_list) - Parameters - ---------- - model : keras model - the model user wants to compress - config_list : list - the configurations that users specify for compression - """ self.bound_model = model - self.config_list = config_list - self.modules_to_compress = [] + self.wrappers = [] - def detect_modules_to_compress(self): - """ - detect all modules should be compressed, and save the result in `self.modules_to_compress`. + for layer_info in _detect_layers_to_compress(model, config_list): + self.wrappers.append(LayerWrapperClass(layer_info, self)) + if not self.wrappers: + _logger.warning('Nothing is configured to compress, please check your model and config list') - The model will be instrumented and user should never edit it after calling this method. 
- """ - if self.modules_to_compress is None: - self.modules_to_compress = [] - for keras_layer in self.bound_model.layers: - layer = LayerInfo(keras_layer) - config = self.select_config(layer) - if config is not None: - self.modules_to_compress.append((layer, config)) - return self.modules_to_compress + _instrument_model(model, self.wrappers) - def compress(self): + def set_wrappers_attribute(self, name, value): """ - Compress the model with algorithm implemented by subclass. - - The model will be instrumented and user should never edit it after calling this method. - `self.modules_to_compress` records all the to-be-compressed layers + Call ``setattr`` on all wrappers. """ - modules_to_compress = self.detect_modules_to_compress() - for layer, config in modules_to_compress: - self._instrument_layer(layer, config) - return self.bound_model + for wrapper in self.wrappers: + setattr(wrapper, name, value) - def get_modules_to_compress(self): - """ - To obtain all the to-be-compressed layers. - Returns - ------- - self.modules_to_compress : list - a list of the layers, each of which is a tuple (`layer`, `config`), - `layer` is `LayerInfo`, `config` is a `dict` - """ - return self.modules_to_compress +class Pruner(Compressor): + """ + Base class for pruning algorithms. - def select_config(self, layer): - """ - Find the configuration for `layer` by parsing `self.config_list` + End users should use ``compress`` and callback APIs (WIP) to prune their models. - Parameters - ---------- - layer: LayerInfo - one layer + The underlying model is instrumented upon initialization of pruner object. + So if you want to pre-train the model, train it before creating pruner object. 
- Returns - ------- - ret : config or None - the retrieved configuration for this layer, if None, this layer should - not be compressed - """ - ret = None - if layer.type is None: - return None - for config in self.config_list: - config = config.copy() - config['op_types'] = self._expand_config_op_types(config) - if layer.type not in config['op_types']: - continue - if config.get('op_names') and layer.name not in config['op_names']: - continue - ret = config - if ret is None or ret.get('exclude'): - return None - return ret + The compressed model can only execute in eager mode. - def update_epoch(self, epoch): - """ - If user want to update model every epoch, user can override this method. - This method should be called at the beginning of each epoch + Algorithm developers should override ``calc_masks`` method to specify pruning strategy. - Parameters - ---------- - epoch : num - the current epoch number - """ + Parameters + ---------- + model : tf.keras.Model + The user model to prune. + config_list : list of JSON object + User configuration. The format is detailed in tutorial. + """ + def __init__(self, model, config_list): + super().__init__(PrunerLayerWrapper, model, config_list) + #self.callback = PrunerCallback(self) - def step(self): - """ - If user want to update mask every step, user can override this method + def compress(self): """ + Apply compression on a pre-trained model. + If you want to prune the model during training, use callback API (WIP) instead. - def _instrument_layer(self, layer, config): + Returns + ------- + tf.keras.Model + The compressed model, for convenience. This is exactly the same object to constructor argument. 
""" - This method is implemented in the subclasses, i.e., `Pruner` and `Quantizer` + self._update_mask() + return self.bound_model - Parameters - ---------- - layer : LayerInfo - the layer to instrument the compression operation - config : dict - the configuration for compressing this layer + def calc_masks(self, wrapper, **kwargs): """ - raise NotImplementedError() - - def _expand_config_op_types(self, config): - if config is None: - return [] - op_types = [] - - for op_type in config.get('op_types', []): - if op_type == 'default': - op_types.extend(default_layers.default_layers) - else: - op_types.append(op_type) - return op_types + Abstract method to be overridden by algorithm. End users should ignore it. - -class Pruner(Compressor): - """ - Abstract base TensorFlow pruner - """ - - def calc_mask(self, layer, config): - """ - Pruners should overload this method to provide mask for weight tensors. - The mask must have the same shape and type comparing to the weight. - It will be applied with `mul()` operation on the weight. - This method is effectively hooked to `forward()` method of the model. + If the callback is set up, this method will be invoked at end of each training minibatch. + If not, it will only be called when end user invokes ``compress``. Parameters ---------- - layer : LayerInfo - calculate mask for `layer`'s weight - config : dict - the configuration for generating the mask - """ - raise NotImplementedError("Pruners must overload calc_mask()") + wrapper : PrunerLayerWrapper + The instrumented layer. + **kwargs + Reserved for forward compatibility. - def _instrument_layer(self, layer, config): - """ - Create a wrapper forward function to replace the original one. - - Parameters - ---------- - layer : LayerInfo - the layer to instrument the mask - config : dict - the configuration for generating the mask + Returns + ------- + dict of (str, tf.Tensor), or None + The key is weight ``Variable``'s name. 
The value is a mask ``Tensor`` of weight's shape and dtype. + If a weight's key does not appear in the return value, that weight will not be pruned. + Returning ``None`` means the mask is not changed since last time. + Weight names are globally unique, e.g. `model/conv_1/kernel:0`. """ - layer._call = layer.keras_layer.call + # TODO: maybe it should be able to calc on weight-granularity, beside from layer-granularity + raise NotImplementedError("Pruners must overload calc_masks()") - def new_call(*inputs): - weights = [x.numpy() for x in layer.keras_layer.weights] - mask = self.calc_mask(layer, config) - weights[layer.weight_index] = weights[layer.weight_index] * mask - layer.keras_layer.set_weights(weights) - ret = layer._call(*inputs) - return ret + def _update_mask(self): + for wrapper_idx, wrapper in enumerate(self.wrappers): + masks = self.calc_masks(wrapper, wrapper_idx=wrapper_idx) + if masks is not None: + wrapper.masks = masks - layer.keras_layer.call = new_call -class Quantizer(Compressor): +class PrunerLayerWrapper(tf.keras.Model): """ - Abstract base TensorFlow quantizer + Instrumented TF layer. + + Wrappers will be passed to pruner's ``calc_masks`` API, + and the pruning algorithm should use wrapper's attributes to calculate masks. + + Once instrumented, underlying layer's weights will get **modified** by masks before forward pass. + + Attributes + ---------- + layer_info : LayerInfo + All static information of the original layer. + layer : tf.keras.layers.Layer + The original layer. + config : JSON object + Selected configuration. The format is detailed in tutorial. + pruner : Pruner + Bound pruner object. + masks : dict of (str, tf.Tensor) + Current masks. The key is weight's name and the value is mask tensor. + On initialization, `masks` is an empty dict, which means no weight is pruned. + Afterwards, `masks` is the last return value of ``Pruner.calc_masks``. + See ``Pruner.calc_masks`` for details. 
""" - - def quantize_weight(self, weight, config, op, op_type, op_name): - raise NotImplementedError("Quantizer must overload quantize_weight()") + def __init__(self, layer_info, pruner): + super().__init__() + self.layer_info = layer_info + self.layer = layer_info.layer + self.config = layer_info.config + self.pruner = pruner + self.masks = {} + _logger.info('Layer detected to compress: %s', self.layer.name) + + def call(self, *inputs): + new_weights = [] + for weight in self.layer.weights: + mask = self.masks.get(weight.name) + if mask is not None: + new_weights.append(tf.math.multiply(weight, mask).numpy()) + else: + new_weights.append(weight.numpy()) + self.layer.set_weights(new_weights) + return self.layer(*inputs) + + +# TODO: designed to replace `patch_optimizer` +#class PrunerCallback(tf.keras.callbacks.Callback): +# def __init__(self, pruner): +# super().__init__() +# self._pruner = pruner +# +# def on_train_batch_end(self, batch, logs=None): +# self._pruner.update_mask() + + +def _detect_layers_to_compress(model, config_list): + # Returns list of LayerInfo. + located_layers = _locate_layers(model) + ret = [] + for layer in model.layers: + config = _select_config(LayerInfo(layer), config_list) + if config is not None: + if id(layer) not in located_layers: + _logger.error('Failed to locate layer %s in model. The layer will not be compressed. ' + 'This is a bug in NNI, feel free to fire an issue.', layer.name) + continue + layer_info = located_layers[id(layer)] + layer_info.config = config + ret.append(layer_info) + return ret + +def _locate_layers(model, cur_path=[]): + # Find out how to access layers from model object. + # Returns dict of (layer's object ID, LayerInfo). + # This function is required because TF framework does not track layer's attribute name, + # and to my knowledge `Layer.name` is only useful for read-only access. + # `cur_path`s format is documented in `LayerInfo.path`. + # TODO: it can only find layers in `Model` and `list` for now. 
+ ret = {} + + if isinstance(model, tf.keras.Model): + for key, value in model.__dict__.items(): + if isinstance(value, tf.keras.Model): + ret.update(_locate_layers(value, cur_path + [key])) + elif isinstance(value, list): + ret.update(_locate_layers(value, cur_path + [key])) + elif isinstance(value, tf.keras.layers.Layer): + ret[id(value)] = LayerInfo(value, cur_path + [key]) + + elif isinstance(model, list): + for i, item in enumerate(model): + if isinstance(item, tf.keras.Model): + ret.update(_locate_layers(item, cur_path + [i])) + elif isinstance(item, tf.keras.layers.Layer): + ret[id(item)] = LayerInfo(item, cur_path + [i]) + + else: + raise ValueError('Unexpected model type: {}'.format(type(model))) + return ret + +def _select_config(layer_info, config_list): + # Find the last matching config block for given layer. + # Returns None if the layer should not be compressed. + ret = None + for config in config_list: + if 'op_types' in config: + match = layer_info.type in config['op_types'] + match_default = 'default' in config['op_types'] and layer_info.type in default_layers.weighted_modules + if not match and not match_default: + continue + if 'op_names' in config and layer_info.name not in config['op_names']: + continue + ret = config + if ret is None or 'exclude' in ret: + return None + return ret + + +def _instrument_model(model, wrappers): + # Replace layers to wrappers + for wrapper in reversed(wrappers): + cur = model + for key in wrapper.layer_info.path[:-1]: + if isinstance(key, int): + cur = cur[key] + else: + cur = getattr(cur, key) + key = wrapper.layer_info.path[-1] + if isinstance(key, int): + cur[key] = wrapper + else: + setattr(cur, key, wrapper) diff --git a/src/sdk/pynni/nni/compression/tensorflow/default_layers.py b/src/sdk/pynni/nni/compression/tensorflow/default_layers.py index 2ecc46e3e3..0c729bd883 100644 --- a/src/sdk/pynni/nni/compression/tensorflow/default_layers.py +++ b/src/sdk/pynni/nni/compression/tensorflow/default_layers.py @@ 
-1,31 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -from tensorflow import keras - -supported_layers = { - keras.layers.Conv1D: ('Conv1D', 0), - keras.layers.Conv2D: ('Conv2D', 0), - keras.layers.Conv2DTranspose: ('Conv2DTranspose', 0), - keras.layers.Conv3D: ('Conv3D', 0), - keras.layers.Conv3DTranspose: ('Conv3DTranspose', 0), - keras.layers.ConvLSTM2D: ('ConvLSTM2D', 0), - keras.layers.Dense: ('Dense', 0), - keras.layers.Embedding: ('Embedding', 0), - keras.layers.GRU: ('GRU', 0), - keras.layers.LSTM: ('LSTM', 0), -} - -default_layers = [x[0] for x in supported_layers.values()] - -def get_op_type(layer_type): - if layer_type in supported_layers: - return supported_layers[layer_type][0] - else: - return None - -def get_weight_index(op_type): - for k in supported_layers: - if supported_layers[k][0] == op_type: - return supported_layers[k][1] - return None +weighted_modules = [ + 'Conv1D', 'Conv2D', 'Conv3D', 'Conv1DTranspose', 'Conv2DTranspose', 'Conv3DTranspose', + 'Dense', + 'PReLU', + 'Embedding', +] diff --git a/src/sdk/pynni/nni/compression/tensorflow/pruning/__init__.py b/src/sdk/pynni/nni/compression/tensorflow/pruning/__init__.py new file mode 100644 index 0000000000..f8ac8ea9b9 --- /dev/null +++ b/src/sdk/pynni/nni/compression/tensorflow/pruning/__init__.py @@ -0,0 +1 @@ +from .one_shot import * diff --git a/src/sdk/pynni/nni/compression/tensorflow/pruning/one_shot.py b/src/sdk/pynni/nni/compression/tensorflow/pruning/one_shot.py new file mode 100644 index 0000000000..ace3d39e4e --- /dev/null +++ b/src/sdk/pynni/nni/compression/tensorflow/pruning/one_shot.py @@ -0,0 +1,67 @@ +import tensorflow as tf + +from ..compressor import Pruner + +__all__ = [ + 'OneshotPruner', + 'LevelPruner', +] + +class OneshotPruner(Pruner): + def __init__(self, model, config_list, pruning_algorithm='level', **algo_kwargs): + super().__init__(model, config_list) + self.set_wrappers_attribute('calculated', False) + self.masker = 
MASKER_DICT[pruning_algorithm](model, self, **algo_kwargs) + + def validate_config(self, model, config_list): + pass # TODO + + def calc_masks(self, wrapper, wrapper_idx=None): + if wrapper.calculated: + return None + sparsity = wrapper.config['sparsity'] + masks = self.masker.calc_masks(sparsity, wrapper, wrapper_idx) + if masks is not None: + wrapper.calculated = True + return masks + + +class LevelPruner(OneshotPruner): + def __init__(self, model, config_list): + super().__init__(model, config_list, pruning_algorithm='level') + + +class WeightMasker: + def __init__(self, model, pruner, **kwargs): + self.model = model + self.pruner = pruner + + def calc_masks(self, sparsity, wrapper, wrapper_idx=None): + raise NotImplementedError() + + +class LevelPrunerMasker(WeightMasker): + def calc_masks(self, sparsity, wrapper, wrapper_idx=None): + masks = {} + for weight_variable in wrapper.layer.weights: + if weight_variable.name == 'bias': + continue + + k = int(tf.size(weight_variable).numpy() * sparsity) + if k == 0: + continue + + weight = weight_variable.read_value() + if wrapper.masks.get(weight_variable.name) is not None: + weight = tf.math.multiply(weight, wrapper.masks[weight_variable.name]) + + w_abs = tf.math.abs(tf.reshape(weight, [-1])) + threshold = tf.math.top_k(w_abs, k)[0][0] + mask = tf.math.greater(w_abs, threshold) + masks[weight_variable.name] = tf.cast(mask, weight.dtype) + return masks + + +MASKER_DICT = { + 'level': LevelPrunerMasker, +} diff --git a/src/sdk/pynni/nni/compression/torch/compressor.py b/src/sdk/pynni/nni/compression/torch/compressor.py index 51880ece12..e5e3017a07 100644 --- a/src/sdk/pynni/nni/compression/torch/compressor.py +++ b/src/sdk/pynni/nni/compression/torch/compressor.py @@ -54,20 +54,34 @@ def __init__(self, model, config_list, optimizer=None): self._fwd_hook_handles = {} self._fwd_hook_id = 0 - for layer, config in self._detect_modules_to_compress(): - wrapper = self._wrap_modules(layer, config) - 
self.modules_wrapper.append(wrapper) + self.reset() + if not self.modules_wrapper: _logger.warning('Nothing is configured to compress, please check your model and config_list') - self._wrap_model() - def validate_config(self, model, config_list): """ subclass can optionally implement this method to check if config_list if valid """ pass + def reset(self, checkpoint=None): + """ + reset model state dict and model wrapper + """ + self._unwrap_model() + if checkpoint is not None: + self.bound_model.load_state_dict(checkpoint) + + self.modules_to_compress = None + self.modules_wrapper = [] + + for layer, config in self._detect_modules_to_compress(): + wrapper = self._wrap_modules(layer, config) + self.modules_wrapper.append(wrapper) + + self._wrap_model() + def _detect_modules_to_compress(self): """ detect all modules should be compressed, and save the result in `self.modules_to_compress`. @@ -346,7 +360,7 @@ def _wrap_modules(self, layer, config): config : dict the configuration for generating the mask """ - _logger.info("Module detected to compress : %s.", layer.name) + _logger.debug("Module detected to compress : %s.", layer.name) wrapper = PrunerModuleWrapper(layer.module, layer.name, layer.type, config, self) assert hasattr(layer.module, 'weight'), "module %s does not have 'weight' attribute" % layer.name # move newly registered buffers to the same device of weight @@ -381,7 +395,7 @@ def export_model(self, model_path, mask_path=None, onnx_path=None, input_shape=N if weight_mask is not None: mask_sum = weight_mask.sum().item() mask_num = weight_mask.numel() - _logger.info('Layer: %s Sparsity: %.4f', wrapper.name, 1 - mask_sum / mask_num) + _logger.debug('Layer: %s Sparsity: %.4f', wrapper.name, 1 - mask_sum / mask_num) wrapper.module.weight.data = wrapper.module.weight.data.mul(weight_mask) if bias_mask is not None: wrapper.module.bias.data = wrapper.module.bias.data.mul(bias_mask) diff --git a/src/sdk/pynni/nni/compression/torch/pruning/__init__.py 
b/src/sdk/pynni/nni/compression/torch/pruning/__init__.py index a6977d634e..9787ba5291 100644 --- a/src/sdk/pynni/nni/compression/torch/pruning/__init__.py +++ b/src/sdk/pynni/nni/compression/torch/pruning/__init__.py @@ -11,3 +11,6 @@ from .net_adapt_pruner import NetAdaptPruner from .admm_pruner import ADMMPruner from .auto_compress_pruner import AutoCompressPruner +from .sensitivity_pruner import SensitivityPruner +from .amc import AMCPruner + diff --git a/src/sdk/pynni/nni/compression/torch/pruning/amc/__init__.py b/src/sdk/pynni/nni/compression/torch/pruning/amc/__init__.py new file mode 100644 index 0000000000..3c89a879c6 --- /dev/null +++ b/src/sdk/pynni/nni/compression/torch/pruning/amc/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from .amc_pruner import AMCPruner diff --git a/src/sdk/pynni/nni/compression/torch/pruning/amc/amc_pruner.py b/src/sdk/pynni/nni/compression/torch/pruning/amc/amc_pruner.py new file mode 100644 index 0000000000..2852fe6266 --- /dev/null +++ b/src/sdk/pynni/nni/compression/torch/pruning/amc/amc_pruner.py @@ -0,0 +1,328 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os +from copy import deepcopy +from argparse import Namespace +import numpy as np +import torch +from torch.utils.tensorboard import SummaryWriter + +from nni.compression.torch.compressor import Pruner +from .channel_pruning_env import ChannelPruningEnv +from .lib.agent import DDPG +from .lib.utils import get_output_folder + +torch.backends.cudnn.deterministic = True + +class AMCPruner(Pruner): + """ + A pytorch implementation of AMC: AutoML for Model Compression and Acceleration on Mobile Devices. + (https://arxiv.org/pdf/1802.03494.pdf) + + Parameters: + model: nn.Module + The model to be pruned. + config_list: list + Configuration list to configure layer pruning. 
+ Supported keys: + - op_types: operation type to be pruned + - op_names: operation name to be pruned + evaluator: function + function to evaluate the pruned model. + The prototype of the function: + >>> def evaluator(val_loader, model): + >>> ... + >>> return acc + val_loader: torch.utils.data.DataLoader + Data loader of validation dataset. + suffix: str + suffix to help you remember what experiment you ran. Default: None. + job: str + train_export: search best pruned model and export after search. + export_only: export a searched model, searched_model_path and export_path must be specified. + searched_model_path: str + when job == export_only, use searched_model_path to specify the path of the searched model. + export_path: str + path for exporting models + + # parameters for pruning environment + model_type: str + model type to prune, currently 'mobilenet' and 'mobilenetv2' are supported. Default: mobilenet + flops_ratio: float + preserve flops ratio. Default: 0.5 + lbound: float + minimum weight preserve ratio for each layer. Default: 0.2 + rbound: float + maximum weight preserve ratio for each layer. Default: 1.0 + reward: function + reward function type: + - acc_reward: accuracy * 0.01 + - acc_flops_reward: - (100 - accuracy) * 0.01 * np.log(flops) + Default: acc_reward + # parameters for channel pruning + n_calibration_batches: int + number of batches to extract layer information. Default: 60 + n_points_per_layer: int + number of feature points per layer. Default: 10 + channel_round: int + round channel to multiple of channel_round. Default: 8 + + # parameters for ddpg agent + hidden1: int + hidden num of first fully connect layer. Default: 300 + hidden2: int + hidden num of second fully connect layer. Default: 300 + lr_c: float + learning rate for critic. Default: 1e-3 + lr_a: float + learning rate for actor. Default: 1e-4 + warmup: int + number of episodes without training but only filling the replay memory. 
During warmup episodes, + random actions are used for pruning. Default: 100 + discount: float + next Q value discount for deep Q value target. Default: 1.0 + bsize: int + minibatch size for training DDPG agent. Default: 64 + rmsize: int + memory size for each layer. Default: 100 + window_length: int + replay buffer window length. Default: 1 + tau: float + moving average for target network being used by soft_update. Default: 0.01 + # noise + init_delta: float + initial variance of truncated normal distribution. Default: 0.5 + delta_decay: float + delta decay during exploration. Default: 0.99 + + # parameters for training ddpg agent + max_episode_length: int + maximum episode length. Default: 1e9 + output_dir: str + output directory to save log files and model files. Default: ./logs + debug: boolean + debug mode. Default: False + train_episode: int + number of episodes to train the agent for. Default: 800 + epsilon: int + linear decay of exploration policy. Default: 50000 + seed: int + random seed to set for reproducing the experiment. Default: None + """ + + def __init__( + self, + model, + config_list, + evaluator, + val_loader, + suffix=None, + job='train_export', + export_path=None, + searched_model_path=None, + model_type='mobilenet', + dataset='cifar10', + flops_ratio=0.5, + lbound=0.2, + rbound=1., + reward='acc_reward', + n_calibration_batches=60, + n_points_per_layer=10, + channel_round=8, + hidden1=300, + hidden2=300, + lr_c=1e-3, + lr_a=1e-4, + warmup=100, + discount=1., + bsize=64, + rmsize=100, + window_length=1, + tau=0.01, + init_delta=0.5, + delta_decay=0.99, + max_episode_length=1e9, + output_dir='./logs', + debug=False, + train_episode=800, + epsilon=50000, + seed=None): + + self.job = job + self.searched_model_path = searched_model_path + self.export_path = export_path + + if seed is not None: + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + + checkpoint = deepcopy(model.state_dict()) + + super().__init__(model, config_list, optimizer=None) + + # build folder and logs + base_folder_name =
'{}_{}_r{}_search'.format(model_type, dataset, flops_ratio) + if suffix is not None: + base_folder_name = base_folder_name + '_' + suffix + self.output_dir = get_output_folder(output_dir, base_folder_name) + + if self.export_path is None: + self.export_path = os.path.join(self.output_dir, '{}_r{}_exported.pth'.format(model_type, flops_ratio)) + + self.env_args = Namespace( + model_type=model_type, + preserve_ratio=flops_ratio, + lbound=lbound, + rbound=rbound, + reward=reward, + n_calibration_batches=n_calibration_batches, + n_points_per_layer=n_points_per_layer, + channel_round=channel_round, + output=self.output_dir + ) + + self.env = ChannelPruningEnv( + self, evaluator, val_loader, checkpoint, args=self.env_args) + + if self.job == 'train_export': + print('=> Saving logs to {}'.format(self.output_dir)) + self.tfwriter = SummaryWriter(log_dir=self.output_dir) + self.text_writer = open(os.path.join(self.output_dir, 'log.txt'), 'w') + print('=> Output path: {}...'.format(self.output_dir)) + + nb_states = self.env.layer_embedding.shape[1] + nb_actions = 1 # just 1 action here + + rmsize = rmsize * len(self.env.prunable_idx) # for each layer + print('** Actual replay buffer size: {}'.format(rmsize)) + + self.ddpg_args = Namespace( + hidden1=hidden1, + hidden2=hidden2, + lr_c=lr_c, + lr_a=lr_a, + warmup=warmup, + discount=discount, + bsize=bsize, + rmsize=rmsize, + window_length=window_length, + tau=tau, + init_delta=init_delta, + delta_decay=delta_decay, + max_episode_length=max_episode_length, + debug=debug, + train_episode=train_episode, + epsilon=epsilon + ) + self.agent = DDPG(nb_states, nb_actions, self.ddpg_args) + + + def compress(self): + if self.job == 'train_export': + self.train(self.ddpg_args.train_episode, self.agent, self.env, self.output_dir) + self.export_pruned_model() + + def train(self, num_episode, agent, env, output_dir): + agent.is_training = True + step = episode = episode_steps = 0 + episode_reward = 0. 
+ observation = None + T = [] # trajectory + while episode < num_episode: # counting based on episode + # reset if it is the start of episode + if observation is None: + observation = deepcopy(env.reset()) + agent.reset(observation) + + # agent pick action ... + if episode <= self.ddpg_args.warmup: + action = agent.random_action() + # action = sample_from_truncated_normal_distribution(lower=0., upper=1., mu=env.preserve_ratio, sigma=0.5) + else: + action = agent.select_action(observation, episode=episode) + + # env response with next_observation, reward, terminate_info + observation2, reward, done, info = env.step(action) + + T.append([reward, deepcopy(observation), deepcopy(observation2), action, done]) + + # fix-length, never reach here + # if max_episode_length and episode_steps >= max_episode_length - 1: + # done = True + + # [optional] save intermediate model + if num_episode / 3 <= 1 or episode % int(num_episode / 3) == 0: + agent.save_model(output_dir) + + # update + step += 1 + episode_steps += 1 + episode_reward += reward + observation = deepcopy(observation2) + + if done: # end of episode + print( + '#{}: episode_reward:{:.4f} acc: {:.4f}, ratio: {:.4f}'.format( + episode, episode_reward, + info['accuracy'], + info['compress_ratio'] + ) + ) + self.text_writer.write( + '#{}: episode_reward:{:.4f} acc: {:.4f}, ratio: {:.4f}\n'.format( + episode, episode_reward, + info['accuracy'], + info['compress_ratio'] + ) + ) + final_reward = T[-1][0] + # print('final_reward: {}'.format(final_reward)) + # agent observe and update policy + for _, s_t, s_t1, a_t, done in T: + agent.observe(final_reward, s_t, s_t1, a_t, done) + if episode > self.ddpg_args.warmup: + agent.update_policy() + + #agent.memory.append( + # observation, + # agent.select_action(observation, episode=episode), + # 0., False + #) + + # reset + observation = None + episode_steps = 0 + episode_reward = 0.
+ episode += 1 + T = [] + + self.tfwriter.add_scalar('reward/last', final_reward, episode) + self.tfwriter.add_scalar('reward/best', env.best_reward, episode) + self.tfwriter.add_scalar('info/accuracy', info['accuracy'], episode) + self.tfwriter.add_scalar('info/compress_ratio', info['compress_ratio'], episode) + self.tfwriter.add_text('info/best_policy', str(env.best_strategy), episode) + # record the preserve rate for each layer + for i, preserve_rate in enumerate(env.strategy): + self.tfwriter.add_scalar('preserve_rate/{}'.format(i), preserve_rate, episode) + + self.text_writer.write('best reward: {}\n'.format(env.best_reward)) + self.text_writer.write('best policy: {}\n'.format(env.best_strategy)) + self.text_writer.close() + + def export_pruned_model(self): + if self.searched_model_path is None: + wrapper_model_ckpt = os.path.join(self.output_dir, 'best_wrapped_model.pth') + else: + wrapper_model_ckpt = self.searched_model_path + self.env.reset() + self.bound_model.load_state_dict(torch.load(wrapper_model_ckpt)) + + print('validate searched model:', self.env._validate(self.env._val_loader, self.env.model)) + self.env.export_model() + self._unwrap_model() + print('validate exported model:', self.env._validate(self.env._val_loader, self.env.model)) + + torch.save(self.bound_model, self.export_path) + print('exported model saved to: {}'.format(self.export_path)) diff --git a/src/sdk/pynni/nni/compression/torch/pruning/amc/channel_pruning_env.py b/src/sdk/pynni/nni/compression/torch/pruning/amc/channel_pruning_env.py new file mode 100644 index 0000000000..fdd0694e1b --- /dev/null +++ b/src/sdk/pynni/nni/compression/torch/pruning/amc/channel_pruning_env.py @@ -0,0 +1,602 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os +import time +import math +import copy +import numpy as np +import torch +import torch.nn as nn + +from nni.compression.torch.compressor import PrunerModuleWrapper +from .lib.utils import prGreen +from .. 
import AMCWeightMasker + +# for pruning +def acc_reward(net, acc, flops): + return acc * 0.01 + + +def acc_flops_reward(net, acc, flops): + error = (100 - acc) * 0.01 + return -error * np.log(flops) + + +class ChannelPruningEnv: + """ + Env for channel pruning search. + This class is used to prune model using specified pruner. It prunes one layer when + step() is called. When the last layer is pruned, it evaluates the pruned model using + evaluator, and uses the returned value of evaluator to compute the reward of the episode. + + Usage: + env = ChannelPruningEnv(pruner, evaluator, val_loader, checkpoint, env_args) + episode = 0 + T = [] + while episode < num_episode: + action = agent.select_action(observation) + observation2, reward, done, info = env.step(action) + T.append([reward, deepcopy(observation), deepcopy(observation2), action, done]) + + if done: # end of episode, last layer pruned + episode += 1 + # train agent with episode data + for _, s_t, s_t1, a_t, done in T: + agent.observe(final_reward, s_t, s_t1, a_t, done) + agent.update_policy() + T = [] + + Attributes: + prunable_idx: layer indices for prunable layers, the index values are the index + of list(self.model.modules()). Prunable layers are pointwise Conv2d layers and Linear + layers. + buffer_idx: layer indices for buffer layers which refer to the depthwise layers. + Each depthwise layer is always followed by a pointwise layer for both mobilenet and + mobilenetv2. The depthwise layer's filters are pruned when its next pointwise layer's + corresponding input channels are pruned. + shared_idx: layer indices for layers which share input. + For example: [[1,4], [8, 10, 15]] means layer 1 and 4 share same input, and layer + 8, 10 and 15 share another input. + layer_embedding: embeddings for each prunable layer, the embedding is used as + observation for DDPG agent. + layer_info_dict: flops and number of parameters of each layer.
+ min_strategy_dict: key is layer index, value is a two-element list, the first value is the minimum + action of input channel, the second value is the minimum action value of output channel. + strategy_dict: key is layer index, value is a two-element list, the first value is the action of input + channel, the second value is the action of output channel. + + Parameters: + pruner: Pruner + NNI Pruner instance used to prune model. + evaluator: function + function to evaluate the pruned model. + The prototype of the function: + >>> def evaluator(val_loader, model): + >>> ... + >>> return acc + val_loader: torch.utils.data.DataLoader + Data loader of validation dataset. + checkpoint: dict + checkpoint of the model to be pruned. It is used to reset model at beginning of each + episode. + args: + A Namespace object containing following arguments: + model_type: str + model type to prune, currently 'mobilenet' and 'mobilenetv2' are supported. + preserve_ratio: float + preserve flops ratio. + lbound: float + minimum weight preserve ratio for each layer. + rbound: float + maximum weight preserve ratio for each layer. + reward: str + name of the reward function, 'acc_reward' or 'acc_flops_reward' + + # parameters for channel pruning + n_calibration_batches: int + number of batches to extract layer information. + n_points_per_layer: int + number of feature points per layer. + channel_round: int + round channel to multiple of channel_round.
+ + """ + def __init__(self, pruner, evaluator, val_loader, checkpoint, args): + self.pruner = pruner + self.model = pruner.bound_model + self.checkpoint = checkpoint + self.batch_size = val_loader.batch_size + self.preserve_ratio = args.preserve_ratio + self.channel_prune_masker = AMCWeightMasker(self.model, self.pruner, args.channel_round) + + # options from args + self.args = args + self.lbound = args.lbound + self.rbound = args.rbound + + self.n_calibration_batches = args.n_calibration_batches + self.n_points_per_layer = args.n_points_per_layer + self.channel_round = args.channel_round + + # sanity check + assert self.preserve_ratio > self.lbound, 'Error! You can not achieve preserve_ratio smaller than lbound!' + + # prepare data + self._val_loader = val_loader + self._validate = evaluator + + # build indexs + self._build_index() + self.n_prunable_layer = len(self.prunable_idx) + + # extract information for preparing + self._extract_layer_information() + + # build embedding (static part) + self._build_state_embedding() + + # build reward + self.reset() # restore weight + self.org_acc = self._validate(self._val_loader, self.model) + print('=> original acc: {:.3f}%'.format(self.org_acc)) + self.org_model_size = sum(self.wsize_list) + print('=> original weight size: {:.4f} M param'.format(self.org_model_size * 1. / 1e6)) + self.org_flops = sum(self.flops_list) + print('=> FLOPs:') + print([self.layer_info_dict[idx]['flops']/1e6 for idx in sorted(self.layer_info_dict.keys())]) + print('=> original FLOPs: {:.4f} M'.format(self.org_flops * 1. / 1e6)) + + self.expected_preserve_computation = self.preserve_ratio * self.org_flops + + self.reward = eval(args.reward) + + self.best_reward = -math.inf + self.best_strategy = None + self.best_d_prime_list = None + self.best_masks = None + + self.org_w_size = sum(self.wsize_list) + + def step(self, action): + # Pseudo prune and get the corresponding statistics. 
The real pruning happens till the end of all pseudo pruning + if self.visited[self.cur_ind]: + action = self.strategy_dict[self.prunable_idx[self.cur_ind]][0] + preserve_idx = self.index_buffer[self.cur_ind] + else: + action = self._action_wall(action) # percentage to preserve + preserve_idx = None + # prune and update action + action, d_prime, preserve_idx = self.prune_kernel(self.prunable_idx[self.cur_ind], action, preserve_idx) + if not self.visited[self.cur_ind]: + for group in self.shared_idx: + if self.cur_ind in group: # set the shared ones + for g_idx in group: + self.strategy_dict[self.prunable_idx[g_idx]][0] = action + self.strategy_dict[self.prunable_idx[g_idx - 1]][1] = action + self.visited[g_idx] = True + self.index_buffer[g_idx] = preserve_idx.copy() + + self.strategy.append(action) # save action to strategy + self.d_prime_list.append(d_prime) + + self.strategy_dict[self.prunable_idx[self.cur_ind]][0] = action + if self.cur_ind > 0: + self.strategy_dict[self.prunable_idx[self.cur_ind - 1]][1] = action + + # all the actions are made + if self._is_final_layer(): + assert len(self.strategy) == len(self.prunable_idx) + current_flops = self._cur_flops() + acc_t1 = time.time() + acc = self._validate(self._val_loader, self.model) + acc_t2 = time.time() + self.val_time = acc_t2 - acc_t1 + compress_ratio = current_flops * 1. 
/ self.org_flops + info_set = {'compress_ratio': compress_ratio, 'accuracy': acc, 'strategy': self.strategy.copy()} + reward = self.reward(self, acc, current_flops) + + if reward > self.best_reward: + self.best_reward = reward + self.best_strategy = self.strategy.copy() + self.best_d_prime_list = self.d_prime_list.copy() + torch.save(self.model.state_dict(), os.path.join(self.args.output, 'best_wrapped_model.pth')) + prGreen('New best reward: {:.4f}, acc: {:.4f}, compress: {:.4f}'.format(self.best_reward, acc, compress_ratio)) + prGreen('New best policy: {}'.format(self.best_strategy)) + prGreen('New best d primes: {}'.format(self.best_d_prime_list)) + obs = self.layer_embedding[self.cur_ind, :].copy() # actually the same as the last state + done = True + return obs, reward, done, info_set + + info_set = None + reward = 0 + done = False + self.visited[self.cur_ind] = True # set to visited + self.cur_ind += 1 # the index of next layer + # build next state (in-place modify) + self.layer_embedding[self.cur_ind][-3] = self._cur_reduced() * 1. / self.org_flops # reduced + self.layer_embedding[self.cur_ind][-2] = sum(self.flops_list[self.cur_ind + 1:]) * 1. / self.org_flops # rest + self.layer_embedding[self.cur_ind][-1] = self.strategy[-1] # last action + obs = self.layer_embedding[self.cur_ind, :].copy() + + return obs, reward, done, info_set + + def reset(self): + # restore env by loading the checkpoint + self.pruner.reset(self.checkpoint) + self.cur_ind = 0 + self.strategy = [] # pruning strategy + self.d_prime_list = [] + self.strategy_dict = copy.deepcopy(self.min_strategy_dict) + # reset layer embeddings + self.layer_embedding[:, -1] = 1. + self.layer_embedding[:, -2] = 0. + self.layer_embedding[:, -3] = 0. + obs = self.layer_embedding[0].copy() + obs[-2] = sum(self.wsize_list[1:]) * 1. 
/ sum(self.wsize_list) + self.extract_time = 0 + self.fit_time = 0 + self.val_time = 0 + # for share index + self.visited = [False] * len(self.prunable_idx) + self.index_buffer = {} + return obs + + def set_export_path(self, path): + self.export_path = path + + def prune_kernel(self, op_idx, preserve_ratio, preserve_idx=None): + m_list = list(self.model.modules()) + op = m_list[op_idx] + assert (0. < preserve_ratio <= 1.) + assert type(op) == PrunerModuleWrapper + if preserve_ratio == 1: # do not prune + if (preserve_idx is None) or (len(preserve_idx) == op.module.weight.size(1)): + return 1., op.module.weight.size(1), None # should be a full index + op.input_feat = self.layer_info_dict[op_idx]['input_feat'] + op.output_feat = self.layer_info_dict[op_idx]['output_feat'] + + masks = self.channel_prune_masker.calc_mask(sparsity=1-preserve_ratio, wrapper=op, preserve_idx=preserve_idx) + m = masks['weight_mask'].cpu().data + if type(op.module) == nn.Conv2d: + d_prime = (m.sum((0, 2, 3)) > 0).sum().item() + preserve_idx = np.nonzero((m.sum((0, 2, 3)) > 0).numpy())[0] + else: + assert type(op.module) == nn.Linear + d_prime = (m.sum(1) > 0).sum().item() + preserve_idx = np.nonzero((m.sum(1) > 0).numpy())[0] + + op.weight_mask = masks['weight_mask'] + if hasattr(op.module, 'bias') and op.module.bias is not None and 'bias_mask' in masks: + op.bias_mask = masks['bias_mask'] + + action = (m == 1).sum().item() / m.numel() + return action, d_prime, preserve_idx + + def export_model(self): + while True: + self.export_layer(self.prunable_idx[self.cur_ind]) + if self._is_final_layer(): + break + self.cur_ind += 1 + + #TODO replace this speedup implementation with nni.compression.torch.ModelSpeedup + def export_layer(self, op_idx): + m_list = list(self.model.modules()) + op = m_list[op_idx] + assert type(op) == PrunerModuleWrapper + w = op.module.weight.cpu().data + m = op.weight_mask.cpu().data + if type(op.module) == nn.Linear: + w = w.unsqueeze(-1).unsqueeze(-1) + m = 
m.unsqueeze(-1).unsqueeze(-1) + + d_prime = (m.sum((0, 2, 3)) > 0).sum().item() + preserve_idx = np.nonzero((m.sum((0, 2, 3)) > 0).numpy())[0] + assert d_prime <= w.size(1) + + if d_prime == w.size(1): + return + + mask = np.zeros(w.size(1), bool) + mask[preserve_idx] = True + rec_weight = torch.zeros((w.size(0), d_prime, w.size(2), w.size(3))) + rec_weight = w[:, preserve_idx, :, :] + if type(op.module) == nn.Linear: + rec_weight = rec_weight.squeeze() + # no need to provide bias mask for channel pruning + rec_mask = torch.ones_like(rec_weight) + + # assign new weight and mask + device = op.module.weight.device + op.module.weight.data = rec_weight.to(device) + op.weight_mask = rec_mask.to(device) + if type(op.module) == nn.Conv2d: + op.module.in_channels = d_prime + else: + # Linear + op.module.in_features = d_prime + + # export prev layers + prev_idx = self.prunable_idx[self.prunable_idx.index(op_idx) - 1] + for idx in range(prev_idx, op_idx): + m = m_list[idx] + if type(m) == nn.Conv2d: # depthwise + m.weight.data = m.weight.data[mask, :, :, :] + if m.groups == m.in_channels: + m.groups = int(np.sum(mask)) + m.out_channels = d_prime + elif type(m) == nn.BatchNorm2d: + m.weight.data = m.weight.data[mask] + m.bias.data = m.bias.data[mask] + m.running_mean.data = m.running_mean.data[mask] + m.running_var.data = m.running_var.data[mask] + m.num_features = d_prime + + def _is_final_layer(self): + return self.cur_ind == len(self.prunable_idx) - 1 + + def _action_wall(self, action): + """ + Limit the action generated by DDPG for this layer by two constraints: + 1. The total flops must meet the flops reduce target. + For example: the original flops of entire model is 1000, target flops ratio is 0.5, target flops + is 1000*0.5 = 500. The reduced flops of other layers is 400, so the remaining flops quota is 500-400=100, + if the total original flops of this layer is 250, then the maximum ratio is 100/250 = 0.4. So the + action of this layer can not be greater than 0.4. 
+ 2. The action must be greater than lbound which is stored in self.strategy_dict. + """ + assert len(self.strategy) == self.cur_ind + + action = float(action) + action = np.clip(action, 0, 1) + + other_comp = 0 + this_comp = 0 + for i, idx in enumerate(self.prunable_idx): + flop = self.layer_info_dict[idx]['flops'] + buffer_flop = self._get_buffer_flops(idx) + + if i == self.cur_ind - 1: # TODO: add other member in the set + this_comp += flop * self.strategy_dict[idx][0] + # add buffer (but not influenced by ratio) + other_comp += buffer_flop * self.strategy_dict[idx][0] + elif i == self.cur_ind: + this_comp += flop * self.strategy_dict[idx][1] + # also add buffer here (influenced by ratio) + this_comp += buffer_flop + else: + other_comp += flop * self.strategy_dict[idx][0] * self.strategy_dict[idx][1] + # add buffer + other_comp += buffer_flop * self.strategy_dict[idx][0] # only consider input reduction + + self.expected_min_preserve = other_comp + this_comp * action + max_preserve_ratio = (self.expected_preserve_computation - other_comp) * 1. 
/ this_comp + + action = np.minimum(action, max_preserve_ratio) + action = np.maximum(action, self.strategy_dict[self.prunable_idx[self.cur_ind]][0]) # impossible (should be) + + return action + + def _get_buffer_flops(self, idx): + buffer_idx = self.buffer_dict[idx] + buffer_flop = sum([self.layer_info_dict[_]['flops'] for _ in buffer_idx]) + return buffer_flop + + def _cur_flops(self): + flops = 0 + for idx in self.prunable_idx: + c, n = self.strategy_dict[idx] # input, output pruning ratio + flops += self.layer_info_dict[idx]['flops'] * c * n + # add buffer computation + flops += self._get_buffer_flops(idx) * c # only related to input channel reduction + return flops + + def _cur_reduced(self): + # return the reduced FLOPs + reduced = self.org_flops - self._cur_flops() + return reduced + + def _build_index(self): + """ + Build following information/data for later pruning: + self.prunable_idx: layer indices for prunable layers, the index values are the index + of list(self.model.modules()). Prunable layers are pointwise Conv2d layers and Linear + layers. + self.prunable_ops: prunable modules + self.buffer_idx: layer indices for buffer layers which refer to the depthwise layers. + Each depthwise layer is always followed by a pointwise layer for both mobilenet and + mobilenetv2. The depthwise layer's filters are pruned when its next pointwise layer's + corresponding input channels are pruned. + self.shared_idx: layer indices for layers which share input. + For example: [[1,4], [8, 10, 15]] means layer 1 and 4 share same input, and layer + 8, 10 and 15 share another input. + self.org_channels: number of input channels for each layer + self.min_strategy_dict: key is layer index, value is a two-element list, the first value is the minimum + action of input channel, the second value is the minimum action value of output channel. + self.strategy_dict: same as self.min_strategy_dict, but it will be updated later.
+ """ + self.prunable_idx = [] + self.prunable_ops = [] + self.layer_type_dict = {} + self.strategy_dict = {} + self.buffer_dict = {} + this_buffer_list = [] + self.org_channels = [] + # build index and the min strategy dict + for i, m in enumerate(self.model.modules()): + if isinstance(m, PrunerModuleWrapper): + m = m.module + if type(m) == nn.Conv2d and m.groups == m.in_channels: # depth-wise conv, buffer + this_buffer_list.append(i) + else: # really prunable + self.prunable_idx.append(i) + self.prunable_ops.append(m) + self.layer_type_dict[i] = type(m) + self.buffer_dict[i] = this_buffer_list + this_buffer_list = [] # empty + self.org_channels.append(m.in_channels if type(m) == nn.Conv2d else m.in_features) + + self.strategy_dict[i] = [self.lbound, self.lbound] + + self.strategy_dict[self.prunable_idx[0]][0] = 1 # modify the input + self.strategy_dict[self.prunable_idx[-1]][1] = 1 # modify the output + + self.shared_idx = [] + if self.args.model_type == 'mobilenetv2': # TODO: to be tested! 
Share index for residual connection + connected_idx = [4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32] # to be partitioned + last_ch = -1 + share_group = None + for c_idx in connected_idx: + if self.prunable_ops[c_idx].in_channels != last_ch: # new group + last_ch = self.prunable_ops[c_idx].in_channels + if share_group is not None: + self.shared_idx.append(share_group) + share_group = [c_idx] + else: # same group + share_group.append(c_idx) + self.shared_idx.append(share_group) + print('=> Conv layers to share channels: {}'.format(self.shared_idx)) + + self.min_strategy_dict = copy.deepcopy(self.strategy_dict) + + self.buffer_idx = [] + for _, v in self.buffer_dict.items(): + self.buffer_idx += v + + print('=> Prunable layer idx: {}'.format(self.prunable_idx)) + print('=> Buffer layer idx: {}'.format(self.buffer_idx)) + print('=> Shared idx: {}'.format(self.shared_idx)) + print('=> Initial min strategy dict: {}'.format(self.min_strategy_dict)) + + # added for supporting residual connections during pruning + self.visited = [False] * len(self.prunable_idx) + self.index_buffer = {} + + def _extract_layer_information(self): + m_list = list(self.model.modules()) + + self.data_saver = [] + self.layer_info_dict = dict() + self.wsize_list = [] + self.flops_list = [] + + from .lib.utils import measure_layer_for_pruning + + # extend the forward fn to record layer info + def new_forward(m): + def lambda_forward(x): + m.input_feat = x.clone() + #TODO replace this flops counter with nni.compression.torch.utils.counter.count_flops_params + measure_layer_for_pruning(m, x) + y = m.old_forward(x) + m.output_feat = y.clone() + return y + + return lambda_forward + + device = None + for idx in self.prunable_idx + self.buffer_idx: # get all + m = m_list[idx] + m.old_forward = m.forward + m.forward = new_forward(m) + if device is None and type(m) == PrunerModuleWrapper: + device = m.module.weight.device + + # now let the image flow + print('=> Extracting information...') + with 
torch.no_grad(): + for i_b, (inputs, target) in enumerate(self._val_loader): # use image from train set + if i_b == self.n_calibration_batches: + break + self.data_saver.append((inputs.clone(), target.clone())) + input_var = torch.autograd.Variable(inputs).to(device) + + # inference and collect stats + _ = self.model(input_var) + + if i_b == 0: # first batch + for idx in self.prunable_idx + self.buffer_idx: + self.layer_info_dict[idx] = dict() + self.layer_info_dict[idx]['params'] = m_list[idx].params + self.layer_info_dict[idx]['flops'] = m_list[idx].flops + self.wsize_list.append(m_list[idx].params) + self.flops_list.append(m_list[idx].flops) + print('flops:', self.flops_list) + for idx in self.prunable_idx: + f_in_np = m_list[idx].input_feat.data.cpu().numpy() + f_out_np = m_list[idx].output_feat.data.cpu().numpy() + if len(f_in_np.shape) == 4: # conv + if self.prunable_idx.index(idx) == 0: # first conv + f_in2save, f_out2save = None, None + elif m_list[idx].module.weight.size(3) > 1: # normal conv + f_in2save, f_out2save = f_in_np, f_out_np + else: # 1x1 conv + # assert f_out_np.shape[2] == f_in_np.shape[2] # now support k=3 + randx = np.random.randint(0, f_out_np.shape[2] - 0, self.n_points_per_layer) + randy = np.random.randint(0, f_out_np.shape[3] - 0, self.n_points_per_layer) + # input: [N, C, H, W] + self.layer_info_dict[idx][(i_b, 'randx')] = randx.copy() + self.layer_info_dict[idx][(i_b, 'randy')] = randy.copy() + + f_in2save = f_in_np[:, :, randx, randy].copy().transpose(0, 2, 1)\ + .reshape(self.batch_size * self.n_points_per_layer, -1) + + f_out2save = f_out_np[:, :, randx, randy].copy().transpose(0, 2, 1) \ + .reshape(self.batch_size * self.n_points_per_layer, -1) + else: + assert len(f_in_np.shape) == 2 + f_in2save = f_in_np.copy() + f_out2save = f_out_np.copy() + if 'input_feat' not in self.layer_info_dict[idx]: + self.layer_info_dict[idx]['input_feat'] = f_in2save + self.layer_info_dict[idx]['output_feat'] = f_out2save + else: + 
self.layer_info_dict[idx]['input_feat'] = np.vstack( + (self.layer_info_dict[idx]['input_feat'], f_in2save)) + self.layer_info_dict[idx]['output_feat'] = np.vstack( + (self.layer_info_dict[idx]['output_feat'], f_out2save)) + + def _build_state_embedding(self): + # build the static part of the state embedding + print('Building state embedding...') + layer_embedding = [] + module_list = list(self.model.modules()) + for i, ind in enumerate(self.prunable_idx): + m = module_list[ind].module + this_state = [] + if type(m) == nn.Conv2d: + this_state.append(i) # index + this_state.append(0) # layer type, 0 for conv + this_state.append(m.in_channels) # in channels + this_state.append(m.out_channels) # out channels + this_state.append(m.stride[0]) # stride + this_state.append(m.kernel_size[0]) # kernel size + this_state.append(np.prod(m.weight.size())) # weight size + elif type(m) == nn.Linear: + this_state.append(i) # index + this_state.append(1) # layer type, 1 for fc + this_state.append(m.in_features) # in channels + this_state.append(m.out_features) # out channels + this_state.append(0) # stride + this_state.append(1) # kernel size + this_state.append(np.prod(m.weight.size())) # weight size + + # this 3 features need to be changed later + this_state.append(0.) # reduced + this_state.append(0.) # rest + this_state.append(1.) 
# a_{t-1} + layer_embedding.append(np.array(this_state)) + + # normalize the state + layer_embedding = np.array(layer_embedding, 'float') + print('=> shape of embedding (n_layer * n_dim): {}'.format(layer_embedding.shape)) + assert len(layer_embedding.shape) == 2, layer_embedding.shape + for i in range(layer_embedding.shape[1]): + fmin = min(layer_embedding[:, i]) + fmax = max(layer_embedding[:, i]) + if fmax - fmin > 0: + layer_embedding[:, i] = (layer_embedding[:, i] - fmin) / (fmax - fmin) + + self.layer_embedding = layer_embedding + diff --git a/src/sdk/pynni/nni/compression/torch/pruning/amc/lib/__init__.py b/src/sdk/pynni/nni/compression/torch/pruning/amc/lib/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/sdk/pynni/nni/compression/torch/pruning/amc/lib/agent.py b/src/sdk/pynni/nni/compression/torch/pruning/amc/lib/agent.py new file mode 100644 index 0000000000..fe066301b8 --- /dev/null +++ b/src/sdk/pynni/nni/compression/torch/pruning/amc/lib/agent.py @@ -0,0 +1,232 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
# This span of the patch holds the body of amc/lib/agent.py followed by the opening part of
# amc/lib/memory.py.  A `from __future__` import has to be the first statement of a module,
# so the one carried by memory.py is kept here, at the top of the span.
from __future__ import absolute_import

import numpy as np

import torch
import torch.nn as nn
from torch.optim import Adam

from .memory import SequentialMemory
from .utils import to_numpy, to_tensor

criterion = nn.MSELoss()
USE_CUDA = torch.cuda.is_available()


class Actor(nn.Module):
    """Policy network: three linear layers, ReLU between them, sigmoid on the output."""

    def __init__(self, nb_states, nb_actions, hidden1=400, hidden2=300):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(nb_states, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, nb_actions)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of states to actions; the sigmoid keeps every action in (0, 1)."""
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        out = self.sigmoid(out)
        return out


class Critic(nn.Module):
    """Q network: state and action are embedded separately, summed, then reduced to one value."""

    def __init__(self, nb_states, nb_actions, hidden1=400, hidden2=300):
        super(Critic, self).__init__()
        self.fc11 = nn.Linear(nb_states, hidden1)
        self.fc12 = nn.Linear(nb_actions, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)
        self.relu = nn.ReLU()

    def forward(self, xs):
        """Return the value estimate for ``xs``, a ``(state, action)`` pair of tensors."""
        x, a = xs
        out = self.fc11(x) + self.fc12(a)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out


class DDPG(object):
    """
    Actor-critic agent with target networks and a replay memory.

    Parameters
    ----------
    nb_states : int
        Size of the state vector fed to the actor and the critic.
    nb_actions : int
        Size of the action vector produced by the actor.
    args : object
        Hyper-parameter holder. The attributes read here are ``hidden1``, ``hidden2``,
        ``lr_a``, ``lr_c``, ``rmsize``, ``window_length``, ``bsize``, ``tau``, ``discount``,
        ``epsilon``, ``init_delta``, ``delta_decay`` and ``warmup``.
    """

    def __init__(self, nb_states, nb_actions, args):

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            # 'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.lr_a)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr_c)

        self.hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
        self.hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        # self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta, mu=args.ou_mu,
        #                                                sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        self.lbound = 0.  # args.lbound
        self.rbound = 1.  # args.rbound

        # noise
        self.init_delta = args.init_delta
        self.delta_decay = args.delta_decay
        self.warmup = args.warmup

        #
        self.epsilon = 1.0
        # self.s_t = None  # Most recent state
        # self.a_t = None  # Most recent action
        self.is_training = True

        #
        if USE_CUDA:
            self.cuda()

        # moving average baseline
        self.moving_average = None
        self.moving_alpha = 0.5  # based on batch, so small

    def update_policy(self):
        """Run one critic step, one actor step and a soft target update on a sampled batch."""
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # normalize the reward
        batch_mean_reward = np.mean(reward_batch)
        if self.moving_average is None:
            self.moving_average = batch_mean_reward
        else:
            self.moving_average += self.moving_alpha * (batch_mean_reward - self.moving_average)
        reward_batch -= self.moving_average
        # if reward_batch.std() > 0:
        #     reward_batch /= reward_batch.std()

        # Prepare for the target q batch
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        # `np.float` was only an alias of the builtin float and no longer exists in
        # NumPy >= 1.24; float64 is the dtype it used to resolve to.
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(np.float64)) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic([  # pylint: disable=all
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        self.soft_update(self.actor_target, self.actor)
        self.soft_update(self.critic_target, self.critic)

    def eval(self):
        """Switch the four networks to evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        """Move the four networks to the GPU."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t, s_t1, a_t, done):
        """Store the transition in the replay memory while training; ``s_t1`` is not stored."""
        if self.is_training:
            self.memory.append(s_t, a_t, r_t, done)  # save to memory
            # self.s_t = s_t1

    def random_action(self):
        """Draw an action uniformly from ``[lbound, rbound)``."""
        action = np.random.uniform(self.lbound, self.rbound, self.nb_actions)
        # self.a_t = action
        return action

    def select_action(self, s_t, episode):
        """
        Query the actor for ``s_t`` and perturb the result with truncated-normal noise.

        The noise scale is ``init_delta * delta_decay ** (episode - warmup)``.
        """
        # assert episode >= self.warmup, 'Episode: {} warmup: {}'.format(episode, self.warmup)
        action = to_numpy(self.actor(to_tensor(np.array(s_t).reshape(1, -1)))).squeeze(0)
        delta = self.init_delta * (self.delta_decay ** (episode - self.warmup))
        # action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = self.sample_from_truncated_normal_distribution(
            lower=self.lbound, upper=self.rbound, mu=action, sigma=delta)
        action = np.clip(action, self.lbound, self.rbound)

        # self.a_t = action
        return action

    def reset(self, obs):
        """No per-episode state is kept, so there is nothing to reset."""
        pass
        # self.s_t = obs
        # self.random_process.reset_states()

    def load_weights(self, output):
        """Load ``actor.pkl`` and ``critic.pkl`` from directory ``output`` (no-op when None)."""
        if output is None:
            return

        # map_location='cpu' lets checkpoints written on a GPU host be read on a CPU-only
        # one; load_state_dict then copies the values onto whatever device the nets are on.
        self.actor.load_state_dict(
            torch.load('{}/actor.pkl'.format(output), map_location='cpu')
        )

        self.critic.load_state_dict(
            torch.load('{}/critic.pkl'.format(output), map_location='cpu')
        )

    def save_model(self, output):
        """Write the actor and critic state dicts to ``output`` as actor.pkl / critic.pkl."""
        torch.save(
            self.actor.state_dict(),
            '{}/actor.pkl'.format(output)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic.pkl'.format(output)
        )

    def soft_update(self, target, source):
        """Blend ``source`` parameters into ``target`` with weight ``self.tau``."""
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(
                target_param.data * (1.0 - self.tau) + param.data * self.tau
            )

    def hard_update(self, target, source):
        """Copy every parameter of ``source`` into ``target``."""
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def sample_from_truncated_normal_distribution(self, lower, upper, mu, sigma, size=1):
        """Sample from a normal(mu, sigma) truncated to ``[lower, upper]``."""
        from scipy import stats
        return stats.truncnorm.rvs((lower-mu)/sigma, (upper-mu)/sigma, loc=mu, scale=sigma, size=size)


# diff --git a/src/sdk/pynni/nni/compression/torch/pruning/amc/lib/memory.py b/src/sdk/pynni/nni/compression/torch/pruning/amc/lib/memory.py
# new file mode 100644
# index 0000000000..57bbcfceb8
# --- /dev/null
# +++ b/src/sdk/pynni/nni/compression/torch/pruning/amc/lib/memory.py
# @@ -0,0 +1,227 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

# (memory.py's `from __future__ import absolute_import` is at the top of this span.)
from collections import deque, namedtuple
import warnings
import random

import numpy as np

# [reference] https://github.com/matthiasplappert/keras-rl/blob/master/rl/memory.py

# This is to be understood as a transition: Given `state0`, performing `action`
# yields `reward` and results in `state1`, which might be `terminal`.
Experience = namedtuple('Experience', 'state0, action, reward, state1, terminal1')


def sample_batch_indexes(low, high, size):
    """
    Draw ``size`` indexes from ``[low, high)``.

    Indexes are unique when the range holds at least ``size`` values; otherwise a warning
    is issued and they are drawn with replacement.
    """
    if high - low >= size:
        # We have enough data. Draw without replacement, that is each index is unique in the
        # batch. We cannot use `np.random.choice` here because it is horribly inefficient as
        # the memory grows. See https://github.com/numpy/numpy/issues/2764 for a discussion.
        # `random.sample` does the same thing (drawing without replacement) and is way faster.
        r = range(low, high)
        batch_idxs = random.sample(r, size)
    else:
        # Not enough data. Help ourselves with sampling from the range, but the same index
        # can occur multiple times. This is not good and should be avoided by picking a
        # large enough warm-up phase.
        warnings.warn(
            'Not enough entries to sample without replacement. '
            'Consider increasing your warm-up phase to avoid oversampling!')
        # randint's half-open [low, high) is the same set of values as the deprecated
        # random_integers(low, high - 1), which was inclusive on both ends.
        batch_idxs = np.random.randint(low, high, size=size)
    assert len(batch_idxs) == size
    return batch_idxs


class RingBuffer(object):
    """Fixed-capacity FIFO: once ``maxlen`` items are stored, appending drops the oldest."""

    def __init__(self, maxlen):
        self.maxlen = maxlen
        self.start = 0
        self.length = 0
        self.data = [None for _ in range(maxlen)]

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        """Return the ``idx``-th oldest item; raises KeyError outside ``[0, len)``."""
        if idx < 0 or idx >= self.length:
            raise KeyError()
        return self.data[(self.start + idx) % self.maxlen]

    def append(self, v):
        if self.length < self.maxlen:
            # We have space, simply increase the length.
            self.length += 1
        elif self.length == self.maxlen:
            # No space, "remove" the first item.
            self.start = (self.start + 1) % self.maxlen
        else:
            # This should never happen.
            raise RuntimeError()
        self.data[(self.start + self.length - 1) % self.maxlen] = v


def zeroed_observation(observation):
    """Return a zero-filled stand-in with the same structure as ``observation``."""
    if hasattr(observation, 'shape'):
        return np.zeros(observation.shape)
    elif hasattr(observation, '__iter__'):
        out = []
        for x in observation:
            out.append(zeroed_observation(x))
        return out
    else:
        return 0.


class Memory(object):
    """
    Base replay memory that only tracks the most recent observations.

    Parameters
    ----------
    window_length : int
        Number of consecutive observations that make up one state.
    ignore_episode_boundaries : bool
        When True, a state window may span observations of different episodes.
    """

    def __init__(self, window_length, ignore_episode_boundaries=False):
        self.window_length = window_length
        self.ignore_episode_boundaries = ignore_episode_boundaries

        self.recent_observations = deque(maxlen=window_length)
        self.recent_terminals = deque(maxlen=window_length)

    def sample(self, batch_size, batch_idxs=None):
        raise NotImplementedError()

    def append(self, observation, action, reward, terminal, training=True):
        """Remember ``observation`` and ``terminal``; the other arguments are for subclasses."""
        self.recent_observations.append(observation)
        self.recent_terminals.append(terminal)

    def get_recent_state(self, current_observation):
        """
        Build the state window that ends with ``current_observation``.

        Returns a list of ``window_length`` observations, left-padded with zeroed
        observations when not enough history of the same episode is available.
        """
        # This code is slightly complicated by the fact that subsequent observations might be
        # from different episodes. We ensure that an experience never spans multiple episodes.
        # This is probably not that important in practice but it seems cleaner.
        state = [current_observation]
        idx = len(self.recent_observations) - 1
        for offset in range(0, self.window_length - 1):
            current_idx = idx - offset
            current_terminal = self.recent_terminals[current_idx - 1] if current_idx - 1 >= 0 else False
            if current_idx < 0 or (not self.ignore_episode_boundaries and current_terminal):
                # The previously handled observation was terminal, don't add the current one.
                # Otherwise we would leak into a different episode.
                break
            state.insert(0, self.recent_observations[current_idx])
        while len(state) < self.window_length:
            state.insert(0, zeroed_observation(state[0]))
        return state

    def get_config(self):
        """Return the constructor arguments as a dict."""
        config = {
            'window_length': self.window_length,
            'ignore_episode_boundaries': self.ignore_episode_boundaries,
        }
        return config


class SequentialMemory(Memory):
    """
    Replay memory that keeps the last ``limit`` transitions in ring buffers.

    Parameters
    ----------
    limit : int
        Capacity of each ring buffer.
    **kwargs
        Forwarded to ``Memory`` (``window_length``, ``ignore_episode_boundaries``).
    """

    def __init__(self, limit, **kwargs):
        super(SequentialMemory, self).__init__(**kwargs)

        self.limit = limit

        # Do not use deque to implement the memory. This data structure may seem convenient but
        # it is way too slow on random access. Instead, we use our own ring buffer implementation.
        self.actions = RingBuffer(limit)
        self.rewards = RingBuffer(limit)
        self.terminals = RingBuffer(limit)
        self.observations = RingBuffer(limit)

    def sample(self, batch_size, batch_idxs=None):
        """
        Return ``batch_size`` ``Experience`` tuples.

        ``batch_idxs`` (when given) are shifted by one before use, so that every index has
        at least one stored entry in front of it; they are drawn at random when omitted.
        """
        if batch_idxs is None:
            # Draw random indexes such that we have at least a single entry before each
            # index.
            batch_idxs = sample_batch_indexes(0, self.nb_entries - 1, size=batch_size)
        batch_idxs = np.array(batch_idxs) + 1
        assert np.min(batch_idxs) >= 1
        assert np.max(batch_idxs) < self.nb_entries
        assert len(batch_idxs) == batch_size

        # Create experiences
        experiences = []
        for idx in batch_idxs:
            terminal0 = self.terminals[idx - 2] if idx >= 2 else False
            while terminal0:
                # Skip this transition because the environment was reset here. Select a new, random
                # transition and use this instead. This may cause the batch to contain the same
                # transition twice.
                idx = sample_batch_indexes(1, self.nb_entries, size=1)[0]
                terminal0 = self.terminals[idx - 2] if idx >= 2 else False
            assert 1 <= idx < self.nb_entries

            # This code is slightly complicated by the fact that subsequent observations might be
            # from different episodes. We ensure that an experience never spans multiple episodes.
            # This is probably not that important in practice but it seems cleaner.
            state0 = [self.observations[idx - 1]]
            for offset in range(0, self.window_length - 1):
                current_idx = idx - 2 - offset
                # `>= 0` (not `> 0`): the flag stored at position 0 is a valid boundary too,
                # exactly as in Memory.get_recent_state and in the `idx >= 2` check above.
                current_terminal = self.terminals[current_idx - 1] if current_idx - 1 >= 0 else False
                if current_idx < 0 or (not self.ignore_episode_boundaries and current_terminal):
                    # The previously handled observation was terminal, don't add the current one.
                    # Otherwise we would leak into a different episode.
                    break
                state0.insert(0, self.observations[current_idx])
            while len(state0) < self.window_length:
                state0.insert(0, zeroed_observation(state0[0]))
            action = self.actions[idx - 1]
            reward = self.rewards[idx - 1]
            terminal1 = self.terminals[idx - 1]

            # Okay, now we need to create the follow-up state. This is state0 shifted on timestep
            # to the right. Again, we need to be careful to not include an observation from the next
            # episode if the last state is terminal.
            state1 = [np.copy(x) for x in state0[1:]]
            state1.append(self.observations[idx])

            assert len(state0) == self.window_length
            assert len(state1) == len(state0)
            experiences.append(Experience(state0=state0, action=action, reward=reward,
                                          state1=state1, terminal1=terminal1))
        assert len(experiences) == batch_size
        return experiences

    def sample_and_split(self, batch_size, batch_idxs=None):
        """
        Sample a batch and return it as five ``(batch_size, -1)`` float64 arrays.

        Returns
        -------
        tuple
            ``(state0, action, reward, state1, terminal1)``. Note that the terminal array is
            a continuation mask: 0. where the transition was terminal, 1. otherwise.
        """
        experiences = self.sample(batch_size, batch_idxs)

        state0_batch = []
        reward_batch = []
        action_batch = []
        terminal1_batch = []
        state1_batch = []
        for e in experiences:
            state0_batch.append(e.state0)
            state1_batch.append(e.state1)
            reward_batch.append(e.reward)
            action_batch.append(e.action)
            terminal1_batch.append(0. if e.terminal1 else 1.)

        # Prepare and validate parameters.
        state0_batch = np.array(state0_batch, 'double').reshape(batch_size, -1)
        state1_batch = np.array(state1_batch, 'double').reshape(batch_size, -1)
        terminal1_batch = np.array(terminal1_batch, 'double').reshape(batch_size, -1)
        reward_batch = np.array(reward_batch, 'double').reshape(batch_size, -1)
        action_batch = np.array(action_batch, 'double').reshape(batch_size, -1)

        return state0_batch, action_batch, reward_batch, state1_batch, terminal1_batch

    def append(self, observation, action, reward, terminal, training=True):
        """Record one step; the ring buffers are only written when ``training`` is True."""
        super(SequentialMemory, self).append(observation, action, reward, terminal, training=training)

        # This needs to be understood as follows: in `observation`, take `action`, obtain `reward`
        # and whether the next state is `terminal` or not.
        if training:
            self.observations.append(observation)
            self.actions.append(action)
            self.rewards.append(reward)
            self.terminals.append(terminal)

    @property
    def nb_entries(self):
        """Number of stored observations."""
        return len(self.observations)

    def get_config(self):
        """Return the base configuration extended with ``limit``."""
        config = super(SequentialMemory, self).get_config()
        config['limit'] = self.limit
        return config


# diff --git a/src/sdk/pynni/nni/compression/torch/pruning/amc/lib/net_measure.py b/src/sdk/pynni/nni/compression/torch/pruning/amc/lib/net_measure.py
# new file mode 100644
# index 0000000000..b9ba133431
# --- /dev/null
# +++ b/src/sdk/pynni/nni/compression/torch/pruning/amc/lib/net_measure.py
# @@ -0,0 +1,123 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
+ +import torch + +# [reference] https://github.com/ShichenLiu/CondenseNet/blob/master/utils.py + + +def get_num_gen(gen): + return sum(1 for _ in gen) + + +def is_leaf(model): + return get_num_gen(model.children()) == 0 + + +def get_layer_info(layer): + layer_str = str(layer) + type_name = layer_str[:layer_str.find('(')].strip() + return type_name + + +def get_layer_param(model): + import operator + import functools + + return sum([functools.reduce(operator.mul, i.size(), 1) for i in model.parameters()]) + +count_ops = 0 +count_params = 0 + +def measure_layer(layer, x): + global count_ops, count_params + delta_ops = 0 + delta_params = 0 + multi_add = 1 + type_name = get_layer_info(layer) + + # ops_conv + if type_name in ['Conv2d']: + out_h = int((x.size()[2] + 2 * layer.padding[0] - layer.kernel_size[0]) / + layer.stride[0] + 1) + out_w = int((x.size()[3] + 2 * layer.padding[1] - layer.kernel_size[1]) / + layer.stride[1] + 1) + delta_ops = layer.in_channels * layer.out_channels * layer.kernel_size[0] * \ + layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add + delta_params = get_layer_param(layer) + + # ops_nonlinearity + elif type_name in ['ReLU']: + delta_ops = x.numel() / x.size(0) + delta_params = get_layer_param(layer) + + # ops_pooling + elif type_name in ['AvgPool2d']: + in_w = x.size()[2] + kernel_ops = layer.kernel_size * layer.kernel_size + out_w = int((in_w + 2 * layer.padding - layer.kernel_size) / layer.stride + 1) + out_h = int((in_w + 2 * layer.padding - layer.kernel_size) / layer.stride + 1) + delta_ops = x.size()[1] * out_w * out_h * kernel_ops + delta_params = get_layer_param(layer) + + elif type_name in ['AdaptiveAvgPool2d']: + delta_ops = x.size()[1] * x.size()[2] * x.size()[3] + delta_params = get_layer_param(layer) + + # ops_linear + elif type_name in ['Linear']: + weight_ops = layer.weight.numel() * multi_add + bias_ops = layer.bias.numel() + delta_ops = weight_ops + bias_ops + delta_params = get_layer_param(layer) + + # 
ops_nothing + elif type_name in ['BatchNorm2d', 'Dropout2d', 'DropChannel', 'Dropout']: + delta_params = get_layer_param(layer) + + # unknown layer type + else: + delta_params = get_layer_param(layer) + + count_ops += delta_ops + count_params += delta_params + + return + + +def measure_model(model, H, W): + global count_ops, count_params + count_ops = 0 + count_params = 0 + data = torch.zeros(2, 3, H, W).cuda() + + def should_measure(x): + return is_leaf(x) + + def modify_forward(model): + for child in model.children(): + if should_measure(child): + def new_forward(m): + def lambda_forward(x): + measure_layer(m, x) + return m.old_forward(x) + return lambda_forward + child.old_forward = child.forward + child.forward = new_forward(child) + else: + modify_forward(child) + + def restore_forward(model): + for child in model.children(): + # leaf node + if is_leaf(child) and hasattr(child, 'old_forward'): + child.forward = child.old_forward + child.old_forward = None + else: + restore_forward(child) + + modify_forward(model) + model.forward(data) + restore_forward(model) + + return count_ops, count_params diff --git a/src/sdk/pynni/nni/compression/torch/pruning/amc/lib/utils.py b/src/sdk/pynni/nni/compression/torch/pruning/amc/lib/utils.py new file mode 100644 index 0000000000..477efccb10 --- /dev/null +++ b/src/sdk/pynni/nni/compression/torch/pruning/amc/lib/utils.py @@ -0,0 +1,124 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import os +import torch + +class TextLogger(object): + """Write log immediately to the disk""" + def __init__(self, filepath): + self.f = open(filepath, 'w') + self.fid = self.f.fileno() + self.filepath = filepath + + def close(self): + self.f.close() + + def write(self, content): + self.f.write(content) + self.f.flush() + os.fsync(self.fid) + + def write_buf(self, content): + self.f.write(content) + + def print_and_write(self, content): + print(content) + self.write(content+'\n') + +def to_numpy(var): + use_cuda = torch.cuda.is_available() + return var.cpu().data.numpy() if use_cuda else var.data.numpy() + + +def to_tensor(ndarray, requires_grad=False): # return a float tensor by default + tensor = torch.from_numpy(ndarray).float() # by default does not require grad + if requires_grad: + tensor.requires_grad_() + return tensor.cuda() if torch.cuda.is_available() else tensor + + +def measure_layer_for_pruning(wrapper, x): + def get_layer_type(layer): + layer_str = str(layer) + return layer_str[:layer_str.find('(')].strip() + + def get_layer_param(model): + import operator + import functools + + return sum([functools.reduce(operator.mul, i.size(), 1) for i in model.parameters()]) + + multi_add = 1 + layer = wrapper.module + type_name = get_layer_type(layer) + + # ops_conv + if type_name in ['Conv2d']: + out_h = int((x.size()[2] + 2 * layer.padding[0] - layer.kernel_size[0]) / + layer.stride[0] + 1) + out_w = int((x.size()[3] + 2 * layer.padding[1] - layer.kernel_size[1]) / + layer.stride[1] + 1) + wrapper.flops = layer.in_channels * layer.out_channels * layer.kernel_size[0] * \ + layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add + wrapper.params = get_layer_param(layer) + # ops_linear + elif type_name in ['Linear']: + weight_ops = layer.weight.numel() * multi_add + bias_ops = layer.bias.numel() + wrapper.flops = weight_ops + bias_ops + wrapper.params = get_layer_param(layer) + return + + +def least_square_sklearn(X, Y): + from sklearn.linear_model 
import LinearRegression + reg = LinearRegression(fit_intercept=False) + reg.fit(X, Y) + return reg.coef_ + + +def get_output_folder(parent_dir, env_name): + """Return save folder. + Assumes folders in the parent_dir have suffix -run{run + number}. Finds the highest run number and sets the output folder + to that number + 1. This is just convenient so that if you run the + same script multiple times tensorboard can plot all of the results + on the same plots with different names. + Parameters + ---------- + parent_dir: str + Path of the directory containing all experiment runs. + Returns + ------- + parent_dir/run_dir + Path to this run's save directory. + """ + os.makedirs(parent_dir, exist_ok=True) + experiment_id = 0 + for folder_name in os.listdir(parent_dir): + if not os.path.isdir(os.path.join(parent_dir, folder_name)): + continue + try: + folder_name = int(folder_name.split('-run')[-1]) + if folder_name > experiment_id: + experiment_id = folder_name + except: + pass + experiment_id += 1 + + parent_dir = os.path.join(parent_dir, env_name) + parent_dir = parent_dir + '-run{}'.format(experiment_id) + os.makedirs(parent_dir, exist_ok=True) + return parent_dir + + +# logging +def prRed(prt): print("\033[91m {}\033[00m" .format(prt)) +def prGreen(prt): print("\033[92m {}\033[00m" .format(prt)) +def prYellow(prt): print("\033[93m {}\033[00m" .format(prt)) +def prLightPurple(prt): print("\033[94m {}\033[00m" .format(prt)) +def prPurple(prt): print("\033[95m {}\033[00m" .format(prt)) +def prCyan(prt): print("\033[96m {}\033[00m" .format(prt)) +def prLightGray(prt): print("\033[97m {}\033[00m" .format(prt)) +def prBlack(prt): print("\033[98m {}\033[00m" .format(prt)) diff --git a/src/sdk/pynni/nni/compression/torch/pruning/one_shot.py b/src/sdk/pynni/nni/compression/torch/pruning/one_shot.py index f74eba2a52..b58477a653 100644 --- a/src/sdk/pynni/nni/compression/torch/pruning/one_shot.py +++ b/src/sdk/pynni/nni/compression/torch/pruning/one_shot.py @@ -94,9 +94,11 @@ 
class LevelPruner(OneshotPruner): Supported keys: - sparsity : This is to specify the sparsity operations to be compressed to. - op_types : Operation types to prune. + optimizer: torch.optim.Optimizer + Optimizer used to train model """ - def __init__(self, model, config_list): - super().__init__(model, config_list, pruning_algorithm='level') + def __init__(self, model, config_list, optimizer=None): + super().__init__(model, config_list, pruning_algorithm='level', optimizer=optimizer) class SlimPruner(OneshotPruner): """ @@ -108,9 +110,11 @@ class SlimPruner(OneshotPruner): Supported keys: - sparsity : This is to specify the sparsity operations to be compressed to. - op_types : Only BatchNorm2d is supported in Slim Pruner. + optimizer: torch.optim.Optimizer + Optimizer used to train model """ - def __init__(self, model, config_list): - super().__init__(model, config_list, pruning_algorithm='slim') + def __init__(self, model, config_list, optimizer=None): + super().__init__(model, config_list, pruning_algorithm='slim', optimizer=optimizer) def validate_config(self, model, config_list): schema = CompressorSchema([{ @@ -147,9 +151,11 @@ class L1FilterPruner(_StructuredFilterPruner): Supported keys: - sparsity : This is to specify the sparsity operations to be compressed to. - op_types : Only Conv2d is supported in L1FilterPruner. + optimizer: torch.optim.Optimizer + Optimizer used to train model """ - def __init__(self, model, config_list): - super().__init__(model, config_list, pruning_algorithm='l1') + def __init__(self, model, config_list, optimizer=None): + super().__init__(model, config_list, pruning_algorithm='l1', optimizer=optimizer) class L2FilterPruner(_StructuredFilterPruner): """ @@ -161,9 +167,11 @@ class L2FilterPruner(_StructuredFilterPruner): Supported keys: - sparsity : This is to specify the sparsity operations to be compressed to. - op_types : Only Conv2d is supported in L2FilterPruner. 
+ optimizer: torch.optim.Optimizer + Optimizer used to train model """ - def __init__(self, model, config_list): - super().__init__(model, config_list, pruning_algorithm='l2') + def __init__(self, model, config_list, optimizer=None): + super().__init__(model, config_list, pruning_algorithm='l2', optimizer=optimizer) class FPGMPruner(_StructuredFilterPruner): """ @@ -175,9 +183,11 @@ class FPGMPruner(_StructuredFilterPruner): Supported keys: - sparsity : This is to specify the sparsity operations to be compressed to. - op_types : Only Conv2d is supported in FPGM Pruner. + optimizer: torch.optim.Optimizer + Optimizer used to train model """ - def __init__(self, model, config_list): - super().__init__(model, config_list, pruning_algorithm='fpgm') + def __init__(self, model, config_list, optimizer=None): + super().__init__(model, config_list, pruning_algorithm='fpgm', optimizer=optimizer) class TaylorFOWeightFilterPruner(_StructuredFilterPruner): """ @@ -189,6 +199,8 @@ class TaylorFOWeightFilterPruner(_StructuredFilterPruner): Supported keys: - sparsity : How much percentage of convolutional filters are to be pruned. - op_types : Currently only Conv2d is supported in TaylorFOWeightFilterPruner. + optimizer: torch.optim.Optimizer + Optimizer used to train model """ def __init__(self, model, config_list, optimizer=None, statistics_batch_num=1): super().__init__(model, config_list, pruning_algorithm='taylorfo', optimizer=optimizer, statistics_batch_num=statistics_batch_num) @@ -203,6 +215,8 @@ class ActivationAPoZRankFilterPruner(_StructuredFilterPruner): Supported keys: - sparsity : How much percentage of convolutional filters are to be pruned. - op_types : Only Conv2d is supported in ActivationAPoZRankFilterPruner. 
+ optimizer: torch.optim.Optimizer + Optimizer used to train model """ def __init__(self, model, config_list, optimizer=None, activation='relu', statistics_batch_num=1): super().__init__(model, config_list, pruning_algorithm='apoz', optimizer=optimizer, \ @@ -218,6 +232,8 @@ class ActivationMeanRankFilterPruner(_StructuredFilterPruner): Supported keys: - sparsity : How much percentage of convolutional filters are to be pruned. - op_types : Only Conv2d is supported in ActivationMeanRankFilterPruner. + optimizer: torch.optim.Optimizer + Optimizer used to train model """ def __init__(self, model, config_list, optimizer=None, activation='relu', statistics_batch_num=1): super().__init__(model, config_list, pruning_algorithm='mean_activation', optimizer=optimizer, \ diff --git a/src/sdk/pynni/nni/compression/torch/pruning/sensitivity_pruner.py b/src/sdk/pynni/nni/compression/torch/pruning/sensitivity_pruner.py new file mode 100644 index 0000000000..eee975c770 --- /dev/null +++ b/src/sdk/pynni/nni/compression/torch/pruning/sensitivity_pruner.py @@ -0,0 +1,397 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. +import os +import csv +import copy +import json +import logging +import torch + +from schema import And, Optional +from ..compressor import Pruner +from ..utils.config_validation import CompressorSchema +from .constants_pruner import PRUNER_DICT +from ..utils.sensitivity_analysis import SensitivityAnalysis + + +MAX_PRUNE_RATIO_PER_ITER = 0.95 + +_logger = logging.getLogger('Sensitivity_Pruner') + + +class SensitivityPruner(Pruner): + """ + This function prune the model based on the sensitivity + for each layer. + + Parameters + ---------- + model: torch.nn.Module + model to be compressed + evaluator: function + validation function for the model. This function should return the accuracy + of the validation dataset. 
The input parameters of evaluator can be specified + in the parameter `eval_args` and 'eval_kwargs' of the compress function if needed. + Example: + >>> def evaluator(model): + >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + >>> val_loader = ... + >>> model.eval() + >>> correct = 0 + >>> with torch.no_grad(): + >>> for data, target in val_loader: + >>> data, target = data.to(device), target.to(device) + >>> output = model(data) + >>> # get the index of the max log-probability + >>> pred = output.argmax(dim=1, keepdim=True) + >>> correct += pred.eq(target.view_as(pred)).sum().item() + >>> accuracy = correct / len(val_loader.dataset) + >>> return accuracy + finetuner: function + finetune function for the model. This parameter is not essential, if is not None, + the sensitivity pruner will finetune the model after pruning in each iteration. + The input parameters of finetuner can be specified in the parameter of compress + called `finetune_args` and `finetune_kwargs` if needed. + Example: + >>> def finetuner(model, epoch=3): + >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + >>> train_loader = ... + >>> criterion = torch.nn.CrossEntropyLoss() + >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + >>> model.train() + >>> for _ in range(epoch): + >>> for _, (data, target) in enumerate(train_loader): + >>> data, target = data.to(device), target.to(device) + >>> optimizer.zero_grad() + >>> output = model(data) + >>> loss = criterion(output, target) + >>> loss.backward() + >>> optimizer.step() + base_algo: str + base pruning algorithm. `level`, `l1` or `l2`, by default `l1`. + sparsity_proportion_calc: function + This function generate the sparsity proportion between the conv layers according to the + sensitivity analysis results. We provide a default function to quantify the sparsity + proportion according to the sensitivity analysis results. Users can also customize + this function according to their needs. 
The input of this function is a dict, + for example : {'conv1' : {0.1: 0.9, 0.2 : 0.8}, 'conv2' : {0.1: 0.9, 0.2 : 0.8}}, + in which, 'conv1' and is the name of the conv layer, and 0.1:0.9 means when the + sparsity of conv1 is 0.1 (10%), the model's val accuracy equals to 0.9. + sparsity_per_iter: float + The sparsity of the model that the pruner try to prune in each iteration. + acc_drop_threshold : float + The hyperparameter used to quantifiy the sensitivity for each layer. + checkpoint_dir: str + The dir path to save the checkpoints during the pruning. + """ + + def __init__(self, model, config_list, evaluator, + finetuner=None, base_algo='l1', sparsity_proportion_calc=None, + sparsity_per_iter=0.1, acc_drop_threshold=0.05, checkpoint_dir=None): + + self.base_algo = base_algo + self.model = model + super(SensitivityPruner, self).__init__(model, config_list) + # unwrap the model + self._unwrap_model() + _logger.debug(str(self.model)) + self.evaluator = evaluator + self.finetuner = finetuner + self.analyzer = SensitivityAnalysis( + self.model, self.evaluator, prune_type=base_algo, \ + early_stop_mode='dropped', early_stop_value=acc_drop_threshold) + # Get the original accuracy of the pretrained model + self.ori_acc = None + # Copy the original weights before pruning + self.ori_state_dict = copy.deepcopy(self.model.state_dict()) + self.sensitivities = {} + # Save the weight count for each layer + self.weight_count = {} + self.weight_sum = 0 + # Map the layer name to the layer module + self.named_module = {} + + self.Pruner = PRUNER_DICT[self.base_algo] + # Count the total weight count of the model + for name, submodule in self.model.named_modules(): + self.named_module[name] = submodule + if name in self.analyzer.target_layer: + # Currently, only count the weights in the conv layers + # else the fully connected layer (which contains + # the most weights) may make the pruner prune the + # model too hard + # if hasattr(submodule, 'weight'): # Count all the weights of 
the model + self.weight_count[name] = submodule.weight.data.numel() + self.weight_sum += self.weight_count[name] + # function to generate the sparsity proportion between the conv layers + if sparsity_proportion_calc is None: + self.sparsity_proportion_calc = self._max_prune_ratio + else: + self.sparsity_proportion_calc = sparsity_proportion_calc + # The ratio of remained weights is 1.0 at the begining + self.remained_ratio = 1.0 + self.sparsity_per_iter = sparsity_per_iter + self.acc_drop_threshold = acc_drop_threshold + self.checkpoint_dir = checkpoint_dir + + def validate_config(self, model, config_list): + """ + Parameters + ---------- + model : torch.nn.module + Model to be pruned + config_list : list + List on pruning configs + """ + + if self.base_algo == 'level': + schema = CompressorSchema([{ + 'sparsity': And(float, lambda n: 0 < n < 1), + Optional('op_types'): [str], + Optional('op_names'): [str], + }], model, _logger) + elif self.base_algo in ['l1', 'l2']: + schema = CompressorSchema([{ + 'sparsity': And(float, lambda n: 0 < n < 1), + 'op_types': ['Conv2d'], + Optional('op_names'): [str] + }], model, _logger) + + schema.validate(config_list) + + def load_sensitivity(self, filepath): + """ + load the sensitivity results exported by the sensitivity analyzer + """ + assert os.path.exists(filepath) + with open(filepath, 'r') as csvf: + csv_r = csv.reader(csvf) + header = next(csv_r) + sparsities = [float(x) for x in header[1:]] + sensitivities = {} + for row in csv_r: + layername = row[0] + accuracies = [float(x) for x in row[1:]] + sensitivities[layername] = {} + for i, accuracy in enumerate(accuracies): + sensitivities[layername][sparsities[i]] = accuracy + return sensitivities + + def _max_prune_ratio(self, ori_acc, threshold, sensitivities): + """ + Find the maximum prune ratio for a single layer whose accuracy + drop is lower than the threshold. 
+ + Parameters + ---------- + ori_acc: float + Original accuracy + threshold: float + Accuracy drop threshold + sensitivities: dict + The dict object that stores the sensitivity results for each layer. + For example: {'conv1' : {0.1: 0.9, 0.2 : 0.8}} + Returns + ------- + max_ratios: dict + return the maximum prune ratio for each layer. For example: + {'conv1':0.1, 'conv2':0.2} + """ + max_ratio = {} + for layer in sensitivities: + prune_ratios = sorted(sensitivities[layer].keys()) + last_ratio = 0 + for ratio in prune_ratios: + cur_acc = sensitivities[layer][ratio] + if cur_acc + threshold < ori_acc: + break + last_ratio = ratio + max_ratio[layer] = last_ratio + return max_ratio + + def normalize(self, ratios, target_pruned): + """ + Normalize the prune ratio of each layer according to the + total already pruned ratio and the final target total pruning + ratio + + Parameters + ---------- + ratios: + Dict object that save the prune ratio for each layer + target_pruned: + The amount of the weights expected to be pruned in this + iteration + + Returns + ------- + new_ratios: + return the normalized prune ratios for each layer. + + """ + w_sum = 0 + _Max = 0 + for layername, ratio in ratios.items(): + wcount = self.weight_count[layername] + w_sum += ratio * wcount * \ + (1-self.analyzer.already_pruned[layername]) + target_count = self.weight_sum * target_pruned + for layername in ratios: + ratios[layername] = ratios[layername] * target_count / w_sum + _Max = max(_Max, ratios[layername]) + # Cannot Prune too much in a single iteration + # If a layer's prune ratio is larger than the + # MAX_PRUNE_RATIO_PER_ITER we rescal all prune + # ratios under this threshold + if _Max > MAX_PRUNE_RATIO_PER_ITER: + for layername in ratios: + ratios[layername] = ratios[layername] * \ + MAX_PRUNE_RATIO_PER_ITER / _Max + return ratios + + def create_cfg(self, ratios): + """ + Generate the cfg_list for the pruner according to the prune ratios. 
+ + Parameters + --------- + ratios: + For example: {'conv1' : 0.2} + + Returns + ------- + cfg_list: + For example: [{'sparsity':0.2, 'op_names':['conv1'], 'op_types':['Conv2d']}] + """ + cfg_list = [] + for layername in ratios: + prune_ratio = ratios[layername] + remain = 1 - self.analyzer.already_pruned[layername] + sparsity = remain * prune_ratio + \ + self.analyzer.already_pruned[layername] + if sparsity > 0: + # Pruner does not allow the prune ratio to be zero + cfg = {'sparsity': sparsity, 'op_names': [ + layername], 'op_types': ['Conv2d']} + cfg_list.append(cfg) + return cfg_list + + def current_sparsity(self): + """ + The sparsity of the weight. + """ + pruned_weight = 0 + for layer_name in self.analyzer.already_pruned: + w_count = self.weight_count[layer_name] + prune_ratio = self.analyzer.already_pruned[layer_name] + pruned_weight += w_count * prune_ratio + return pruned_weight / self.weight_sum + + def compress(self, eval_args=None, eval_kwargs=None, + finetune_args=None, finetune_kwargs=None, resume_sensitivity=None): + """ + This function iteratively prune the model according to the results of + the sensitivity analysis. + + Parameters + ---------- + eval_args: list + eval_kwargs: list& dict + Parameters for the val_funtion, the val_function will be called like + evaluator(*eval_args, **eval_kwargs) + finetune_args: list + finetune_kwargs: dict + Parameters for the finetuner function if needed. + resume_sensitivity: + resume the sensitivity results from this file. 
+ """ + # pylint suggest not use the empty list and dict + # as the default input parameter + if not eval_args: + eval_args = [] + if not eval_kwargs: + eval_kwargs = {} + if not finetune_args: + finetune_args = [] + if not finetune_kwargs: + finetune_kwargs = {} + if self.ori_acc is None: + self.ori_acc = self.evaluator(*eval_args, **eval_kwargs) + if not resume_sensitivity: + self.sensitivities = self.analyzer.analysis( + val_args=eval_args, val_kwargs=eval_kwargs) + else: + self.sensitivities = self.load_sensitivity(resume_sensitivity) + self.analyzer.sensitivities = self.sensitivities + # the final target sparsity of the model + target_ratio = 1 - self.config_list[0]['sparsity'] + cur_ratio = self.remained_ratio + ori_acc = self.ori_acc + iteration_count = 0 + if self.checkpoint_dir is not None: + os.makedirs(self.checkpoint_dir, exist_ok=True) + while cur_ratio > target_ratio: + iteration_count += 1 + # Each round have three steps: + # 1) Get the current sensitivity for each layer(the sensitivity + # of each layer may change during the pruning) + # 2) Prune each layer according the sensitivies + # 3) finetune the model + _logger.info('Current base accuracy %f', ori_acc) + _logger.info('Remained %f weights', cur_ratio) + # determine the sparsity proportion between different + # layers according to the sensitivity result + proportion = self.sparsity_proportion_calc( + ori_acc, self.acc_drop_threshold, self.sensitivities) + new_pruneratio = self.normalize(proportion, self.sparsity_per_iter) + cfg_list = self.create_cfg(new_pruneratio) + _logger.debug('Pruner Config: %s', str(cfg_list)) + pruner = self.Pruner(self.model, cfg_list) + pruner.compress() + pruned_acc = self.evaluator(*eval_args, **eval_kwargs) + _logger.info('Accuracy after pruning: %f', pruned_acc) + finetune_acc = pruned_acc + if self.finetuner is not None: + # if the finetune function is None, then skip the finetune + self.finetuner(*finetune_args, **finetune_kwargs) + finetune_acc = 
self.evaluator(*eval_args, **eval_kwargs) + _logger.info('Accuracy after finetune: %f', finetune_acc) + ori_acc = finetune_acc + # unwrap the pruner + pruner._unwrap_model() + # update the already prune ratio of each layer befor the new + # sensitivity analysis + for layer_cfg in cfg_list: + name = layer_cfg['op_names'][0] + sparsity = layer_cfg['sparsity'] + self.analyzer.already_pruned[name] = sparsity + # update the cur_ratio + cur_ratio = 1 - self.current_sparsity() + del pruner + _logger.info('Currently remained weights: %f', cur_ratio) + + if self.checkpoint_dir is not None: + checkpoint_name = 'Iter_%d_finetune_acc_%.5f_sparsity_%.4f' % ( + iteration_count, finetune_acc, cur_ratio) + checkpoint_path = os.path.join( + self.checkpoint_dir, '%s.pth' % checkpoint_name) + cfg_path = os.path.join( + self.checkpoint_dir, '%s_pruner.json' % checkpoint_name) + sensitivity_path = os.path.join( + self.checkpoint_dir, '%s_sensitivity.csv' % checkpoint_name) + torch.save(self.model.state_dict(), checkpoint_path) + with open(cfg_path, 'w') as jf: + json.dump(cfg_list, jf) + self.analyzer.export(sensitivity_path) + if cur_ratio > target_ratio: + # If this is the last prune iteration, skip the time-consuming + # sensitivity analysis + self.analyzer.load_state_dict(self.model.state_dict()) + self.sensitivities = self.analyzer.analysis( + val_args=eval_args, val_kwargs=eval_kwargs) + + _logger.info('After Pruning: %.2f weights remains', cur_ratio) + return self.model + + def calc_mask(self, wrapper, **kwargs): + return None diff --git a/src/sdk/pynni/nni/compression/torch/pruning/structured_pruning.py b/src/sdk/pynni/nni/compression/torch/pruning/structured_pruning.py index 8fb203452c..e1b3dc12ce 100644 --- a/src/sdk/pynni/nni/compression/torch/pruning/structured_pruning.py +++ b/src/sdk/pynni/nni/compression/torch/pruning/structured_pruning.py @@ -2,19 +2,40 @@ # Licensed under the MIT license. 
import logging +import math +import numpy as np import torch from .weight_masker import WeightMasker __all__ = ['L1FilterPrunerMasker', 'L2FilterPrunerMasker', 'FPGMPrunerMasker', \ 'TaylorFOWeightFilterPrunerMasker', 'ActivationAPoZRankFilterPrunerMasker', \ - 'ActivationMeanRankFilterPrunerMasker', 'SlimPrunerMasker'] + 'ActivationMeanRankFilterPrunerMasker', 'SlimPrunerMasker', 'AMCWeightMasker'] logger = logging.getLogger('torch filter pruners') class StructuredWeightMasker(WeightMasker): """ A structured pruning masker base class that prunes convolutional layer filters. + + Parameters + ---------- + model: nn.Module + model to be pruned + pruner: Pruner + A Pruner instance used to prune the model + preserve_round: int + after pruning, preserve filters/channels round to `preserve_round`, for example: + for a Conv2d layer, output channel is 32, sparsity is 0.2, if preserve_round is + 1 (no preserve round), then there will be int(32 * 0.2) = 6 filters pruned, and + 32 - 6 = 26 filters are preserved. If preserve_round is 4, preserved filters will + be round up to 28 (which can be divided by 4) and only 4 filters are pruned. + """ + def __init__(self, model, pruner, preserve_round=1): + self.model = model + self.pruner = pruner + self.preserve_round = preserve_round + def calc_mask(self, sparsity, wrapper, wrapper_idx=None): """ Calculate the mask of given layer. @@ -53,9 +74,16 @@ def calc_mask(self, sparsity, wrapper, wrapper_idx=None): mask_bias = None mask = {'weight_mask': mask_weight, 'bias_mask': mask_bias} - filters = weight.size(0) - num_prune = int(filters * sparsity) - if filters < 2 or num_prune < 1: + num_total = weight.size(0) + num_prune = int(num_total * sparsity) + if self.preserve_round > 1: + num_preserve = num_total - num_prune + num_preserve = int(math.ceil(num_preserve * 1. / self.preserve_round) * self.preserve_round) + if num_preserve > num_total: + num_preserve = int(math.floor(num_total * 1. 
/ self.preserve_round) * self.preserve_round) + num_prune = num_total - num_preserve + + if num_total < 2 or num_prune < 1: return mask # weight*mask_weight: apply base mask for iterative pruning return self.get_mask(mask, weight*mask_weight, num_prune, wrapper, wrapper_idx) @@ -365,3 +393,135 @@ def calc_mask(self, sparsity, wrapper, wrapper_idx=None): mask_bias = mask_weight.clone() mask = {'weight_mask': mask_weight.detach(), 'bias_mask': mask_bias.detach()} return mask + +def least_square_sklearn(X, Y): + from sklearn.linear_model import LinearRegression + reg = LinearRegression(fit_intercept=False) + reg.fit(X, Y) + return reg.coef_ + +class AMCWeightMasker(WeightMasker): + """ + Weight maskser class for AMC pruner. Currently, AMCPruner only supports pruning kernel + size 1x1 pointwise Conv2d layer. Before using this class to prune kernels, AMCPruner + collected input and output feature maps for each layer, the features maps are flattened + and save into wrapper.input_feat and wrapper.output_feat. + + Parameters + ---------- + model: nn.Module + model to be pruned + pruner: Pruner + A Pruner instance used to prune the model + preserve_round: int + after pruning, preserve filters/channels round to `preserve_round`, for example: + for a Conv2d layer, output channel is 32, sparsity is 0.2, if preserve_round is + 1 (no preserve round), then there will be int(32 * 0.2) = 6 filters pruned, and + 32 - 6 = 26 filters are preserved. If preserve_round is 4, preserved filters will + be round up to 28 (which can be divided by 4) and only 4 filters are pruned. + """ + def __init__(self, model, pruner, preserve_round=1): + self.model = model + self.pruner = pruner + self.preserve_round = preserve_round + + def calc_mask(self, sparsity, wrapper, wrapper_idx=None, preserve_idx=None): + """ + Calculate the mask of given layer. 
+ Parameters + ---------- + sparsity: float + pruning ratio, preserved weight ratio is `1 - sparsity` + wrapper: PrunerModuleWrapper + layer wrapper of this layer + wrapper_idx: int + index of this wrapper in pruner's all wrappers + Returns + ------- + dict + dictionary for storing masks, keys of the dict: + 'weight_mask': weight mask tensor + 'bias_mask': bias mask tensor (optional) + """ + msg = 'module type {} is not supported!'.format(wrapper.type) + assert wrapper.type in ['Conv2d', 'Linear'], msg + weight = wrapper.module.weight.data + bias = None + if hasattr(wrapper.module, 'bias') and wrapper.module.bias is not None: + bias = wrapper.module.bias.data + + if wrapper.weight_mask is None: + mask_weight = torch.ones(weight.size()).type_as(weight).detach() + else: + mask_weight = wrapper.weight_mask.clone() + if bias is not None: + if wrapper.bias_mask is None: + mask_bias = torch.ones(bias.size()).type_as(bias).detach() + else: + mask_bias = wrapper.bias_mask.clone() + else: + mask_bias = None + mask = {'weight_mask': mask_weight, 'bias_mask': mask_bias} + + num_total = weight.size(1) + num_prune = int(num_total * sparsity) + if self.preserve_round > 1: + num_preserve = num_total - num_prune + num_preserve = int(math.ceil(num_preserve * 1. 
/ self.preserve_round) * self.preserve_round) + if num_preserve > num_total: + num_preserve = num_total + num_prune = num_total - num_preserve + + if (num_total < 2 or num_prune < 1) and preserve_idx is None: + return mask + + return self.get_mask(mask, weight, num_preserve, wrapper, wrapper_idx, preserve_idx) + + def get_mask(self, base_mask, weight, num_preserve, wrapper, wrapper_idx, preserve_idx): + w = weight.data.cpu().numpy() + if wrapper.type == 'Linear': + w = w[:, :, None, None] + + if preserve_idx is None: + importance = np.abs(w).sum((0, 2, 3)) + sorted_idx = np.argsort(-importance) # sum magnitude along C_in, sort descend + d_prime = num_preserve + preserve_idx = sorted_idx[:d_prime] # to preserve index + else: + d_prime = len(preserve_idx) + + assert len(preserve_idx) == d_prime + mask = np.zeros(w.shape[1], bool) + mask[preserve_idx] = True + + # reconstruct, X, Y <= [N, C] + X, Y = wrapper.input_feat, wrapper.output_feat + masked_X = X[:, mask] + if w.shape[2] == 1: # 1x1 conv or fc + rec_weight = least_square_sklearn(X=masked_X, Y=Y) + rec_weight = rec_weight.reshape(-1, 1, 1, d_prime) # (C_out, K_h, K_w, C_in') + rec_weight = np.transpose(rec_weight, (0, 3, 1, 2)) # (C_out, C_in', K_h, K_w) + else: + raise NotImplementedError('Current code only supports 1x1 conv now!') + rec_weight_pad = np.zeros_like(w) + # pylint: disable=all + rec_weight_pad[:, mask, :, :] = rec_weight + rec_weight = rec_weight_pad + + if wrapper.type == 'Linear': + rec_weight = rec_weight.squeeze() + assert len(rec_weight.shape) == 2 + + # now assign + wrapper.module.weight.data = torch.from_numpy(rec_weight).to(weight.device) + + mask_weight = torch.zeros_like(weight) + if wrapper.type == 'Linear': + mask_weight[:, preserve_idx] = 1. + if base_mask['bias_mask'] is not None and wrapper.module.bias is not None: + mask_bias = torch.ones_like(wrapper.module.bias) + else: + mask_weight[:, preserve_idx, :, :] = 1. 
+ mask_bias = None + + return {'weight_mask': mask_weight.detach(), 'bias_mask': mask_bias} diff --git a/src/sdk/pynni/nni/compression/torch/speedup/compressor.py b/src/sdk/pynni/nni/compression/torch/speedup/compressor.py index b31acfe664..41753e1c9f 100644 --- a/src/sdk/pynni/nni/compression/torch/speedup/compressor.py +++ b/src/sdk/pynni/nni/compression/torch/speedup/compressor.py @@ -141,6 +141,14 @@ def infer_modules_masks(self): """ for module_name, mask in self.masks.items(): _logger.debug('Start mask inference from %s', module_name) + if module_name not in self.torch_graph.name_to_node: + # this module is not traced in the torch_graph, + # jit.trace only correctly records functions and + # modules which are not data dependent (e.g., do + # not have conditionals on data in tensors) + # so, if a node is not traced, we just skip it. + _logger.warning('%s has mask, but not found in the traced graph, just skip it.', module_name) + continue self.infer_module_mask(module_name, None, mask=mask) def replace_compressed_modules(self): diff --git a/src/sdk/pynni/nni/compression/torch/speedup/infer_shape.py b/src/sdk/pynni/nni/compression/torch/speedup/infer_shape.py index 47aa8087df..2635617031 100644 --- a/src/sdk/pynni/nni/compression/torch/speedup/infer_shape.py +++ b/src/sdk/pynni/nni/compression/torch/speedup/infer_shape.py @@ -222,6 +222,7 @@ def __repr__(self): 'ReLU': lambda module_masks, mask: relu_inshape(module_masks, mask), 'ReLU6': lambda module_masks, mask: relu_inshape(module_masks, mask), 'aten::relu': lambda module_masks, mask: relu_inshape(module_masks, mask), + 'aten::relu_': lambda module_masks, mask: relu_inshape(module_masks, mask), 'Conv2d': lambda module_masks, mask: conv2d_inshape(module_masks, mask), 'MaxPool2d': lambda module_masks, mask: maxpool2d_inshape(module_masks, mask), 'aten::max_pool2d': lambda module_masks, mask: maxpool2d_inshape(module_masks, mask), @@ -241,7 +242,8 @@ def __repr__(self): 'aten::cat': lambda module_mask, mask, 
cat_info, last_visited: cat_inshape(module_mask, mask, cat_info, last_visited), 'aten::mean': lambda module_masks, mask, shape: mean_inshape(module_masks, mask, shape), 'Dropout': lambda module_masks, mask: dropout_inshape(module_masks, mask), - 'Dropout2d': lambda module_masks, mask: dropout_inshape(module_masks, mask) + 'Dropout2d': lambda module_masks, mask: dropout_inshape(module_masks, mask), + 'aten::dropout': lambda module_masks, mask: dropout_inshape(module_masks, mask) } """ @@ -258,8 +260,14 @@ def dropout_inshape(module_masks, mask): return module_masks.output_mask # if alreay visited assert module_masks.input_mask <= mask - if module_masks.input_mask == mask: - return None + # It should be the same, we pass the masks by the reference(not the value), + # so they acutually are two references of the same object(mask, + # module_masks.input_mask). So we should continue pass the mask + # to the following nodes even module_masks.input_mask == mask. + # if pass the mask by copy.deepcopy(), then we can stop when + # module_masks.input_mask == mask. 
+ # if module_masks.input_mask == mask: + # return None module_masks.set_input_mask(mask) module_masks.set_output_mask(mask) return module_masks.output_mask @@ -413,7 +421,8 @@ def linear_inshape(module_masks, mask): """ assert isinstance(mask, CoarseMask) assert mask.mask_index[0] is None - assert module_masks.input_mask is None + if module_masks.input_mask is not None: + assert module_masks.input_mask <= mask module_masks.set_input_mask(mask) return None @@ -451,7 +460,10 @@ def view_inshape(module_masks, mask, shape): assert mask.mask_index[0] is None assert mask.mask_index[2] is None assert mask.mask_index[3] is None - assert module_masks.input_mask is None + # due to the cat operation, the same node may be + # accessed more than once + if module_masks.input_mask is not None: + assert module_masks.input_mask <= mask module_masks.set_input_mask(mask) output_cmask = CoarseMask(num_dim=2) index = [] @@ -535,12 +547,9 @@ def relu_inshape(module_masks, mask): The mask of its output tensor """ assert isinstance(mask, CoarseMask) - # TODO: double check this assert, is it possible that a module is passed twice if module_masks.input_mask is not None: # check if has a mask conflict - assert module_masks.input_mask == mask - # No need to pass the mask again - return None + assert module_masks.input_mask <= mask # assert module_masks.input_mask is None, "A relu op can only be processed once" module_masks.set_input_mask(mask) module_masks.set_output_mask(mask) diff --git a/src/sdk/pynni/nni/compression/torch/utils/sensitivity_analysis.py b/src/sdk/pynni/nni/compression/torch/utils/sensitivity_analysis.py index fc259833b6..341c5ab67e 100644 --- a/src/sdk/pynni/nni/compression/torch/utils/sensitivity_analysis.py +++ b/src/sdk/pynni/nni/compression/torch/utils/sensitivity_analysis.py @@ -9,10 +9,7 @@ import numpy as np import torch.nn as nn -from nni.compression.torch import LevelPruner -from nni.compression.torch import L1FilterPruner -from nni.compression.torch import 
L2FilterPruner - +from ..pruning.constants_pruner import PRUNER_DICT SUPPORTED_OP_NAME = ['Conv2d', 'Conv1d'] SUPPORTED_OP_TYPE = [getattr(nn, name) for name in SUPPORTED_OP_NAME] @@ -77,11 +74,7 @@ def __init__(self, model, val_func, sparsities=None, prune_type='l1', early_stop else: self.sparsities = np.arange(0.1, 1.0, 0.1) self.sparsities = [np.round(x, 2) for x in self.sparsities] - self.Pruner = L1FilterPruner - if prune_type == 'l2': - self.Pruner = L2FilterPruner - elif prune_type == 'fine-grained': - self.Pruner = LevelPruner + self.Pruner = PRUNER_DICT[prune_type] self.early_stop_mode = early_stop_mode self.early_stop_value = early_stop_value self.ori_metric = None # original validation metric for the model diff --git a/src/sdk/pynni/nni/feature_engineering/gradient_selector/gradient_selector.py b/src/sdk/pynni/nni/feature_engineering/gradient_selector/gradient_selector.py index 9dea3e8a0e..f7cb69f627 100644 --- a/src/sdk/pynni/nni/feature_engineering/gradient_selector/gradient_selector.py +++ b/src/sdk/pynni/nni/feature_engineering/gradient_selector/gradient_selector.py @@ -24,7 +24,7 @@ import pandas as pd from sklearn.base import BaseEstimator -from sklearn.feature_selection.base import SelectorMixin +from sklearn.feature_selection import SelectorMixin from sklearn.utils.validation import check_is_fitted import torch diff --git a/src/sdk/pynni/nni/feature_engineering/gradient_selector/requirements.txt b/src/sdk/pynni/nni/feature_engineering/gradient_selector/requirements.txt index 06d2241d5f..2aafc0c86f 100644 --- a/src/sdk/pynni/nni/feature_engineering/gradient_selector/requirements.txt +++ b/src/sdk/pynni/nni/feature_engineering/gradient_selector/requirements.txt @@ -1,4 +1,4 @@ numpy==1.14.3 -scikit-learn==0.20.0 +scikit-learn>=0.23.2 scipy==1.1.0 torch==1.1.0 diff --git a/src/sdk/pynni/nni/metis_tuner/requirments.txt b/src/sdk/pynni/nni/metis_tuner/requirments.txt index 05c74c49ca..3dfc2232a1 100644 --- 
a/src/sdk/pynni/nni/metis_tuner/requirments.txt +++ b/src/sdk/pynni/nni/metis_tuner/requirments.txt @@ -1 +1 @@ -scikit-learn==0.20 \ No newline at end of file +scikit-learn>=0.23.2 diff --git a/src/sdk/pynni/nni/nas/tensorflow/enas/trainer.py b/src/sdk/pynni/nni/nas/tensorflow/enas/trainer.py index 2d03a3fbb8..a9645e9203 100644 --- a/src/sdk/pynni/nni/nas/tensorflow/enas/trainer.py +++ b/src/sdk/pynni/nni/nas/tensorflow/enas/trainer.py @@ -136,10 +136,10 @@ def validate_one_epoch(self, epoch): meters = AverageMeterGroup() for x, y in test_loader: self.mutator.reset() - logits = self.model(x) + logits = self.model(x, training=False) if isinstance(logits, tuple): logits, _ = logits - metrics = self.metrics(logits, y) + metrics = self.metrics(y, logits) loss = self.loss(y, logits) metrics['loss'] = tf.reduce_mean(loss).numpy() meters.update(metrics) @@ -151,8 +151,8 @@ def validate_one_epoch(self, epoch): def _create_train_loader(self): train_set = self.train_set.shuffle(1000000).repeat().batch(self.batch_size) - test_set = self.test_set.shuffle(1000000).repeat().batch(self.batch_size) + test_set = self.valid_set.shuffle(1000000).repeat().batch(self.batch_size) return iter(train_set), iter(test_set) def _create_validate_loader(self): - return iter(self.test_set.shuffle(1000000).repeat().batch(self.batch_size)) + return iter(self.test_set.shuffle(1000000).batch(self.batch_size)) diff --git a/src/sdk/pynni/nni/nas/tensorflow/mutables.py b/src/sdk/pynni/nni/nas/tensorflow/mutables.py index b83b6f6325..06183a34c1 100644 --- a/src/sdk/pynni/nni/nas/tensorflow/mutables.py +++ b/src/sdk/pynni/nni/nas/tensorflow/mutables.py @@ -28,20 +28,19 @@ def __init__(self, key=None): def __deepcopy__(self, memodict=None): raise NotImplementedError("Deep copy doesn't work for mutables.") - def __call__(self, *args, **kwargs): - self._check_built() - return super().__call__(*args, **kwargs) - def set_mutator(self, mutator): - if 'mutator' in self.__dict__: + if hasattr(self, 'mutator'): 
raise RuntimeError('`set_mutator is called more than once. ' 'Did you parse the search space multiple times? ' 'Or did you apply multiple fixed architectures?') - self.__dict__['mutator'] = mutator + self.mutator = mutator def call(self, *inputs): raise NotImplementedError('Method `call` of Mutable must be overridden') + def build(self, input_shape): + self._check_built() + @property def key(self): return self._key @@ -68,7 +67,6 @@ def __repr__(self): class MutableScope(Mutable): def __call__(self, *args, **kwargs): try: - self._check_built() self.mutator.enter_mutable_scope(self) return super().__call__(*args, **kwargs) finally: @@ -80,7 +78,7 @@ def __init__(self, op_candidates, reduction='sum', return_mask=False, key=None): super().__init__(key=key) self.names = [] if isinstance(op_candidates, OrderedDict): - for name, _ in op_candidates.items(): + for name in op_candidates: assert name not in ["length", "reduction", "return_mask", "_key", "key", "names"], \ "Please don't use a reserved name '{}' for your module.".format(name) self.names.append(name) @@ -94,21 +92,18 @@ def __init__(self, op_candidates, reduction='sum', return_mask=False, key=None): self.choices = op_candidates self.reduction = reduction self.return_mask = return_mask - self._built = False def call(self, *inputs): - if not self._built: - for op in self.choices: - if len(inputs) > 1: # FIXME: not tested - op.build([inp.shape for inp in inputs]) - elif len(inputs) == 1: - op.build(inputs[0].shape) - self._built = True out, mask = self.mutator.on_forward_layer_choice(self, *inputs) if self.return_mask: return out, mask return out + def build(self, input_shape): + self._check_built() + for op in self.choices: + op.build(input_shape) + def __len__(self): return len(self.choices) diff --git a/src/sdk/pynni/requirements.txt b/src/sdk/pynni/requirements.txt index 282f572631..ec1b8705fb 100644 --- a/src/sdk/pynni/requirements.txt +++ b/src/sdk/pynni/requirements.txt @@ -8,4 +8,4 @@ scipy hyperopt==0.1.2 
# metis tuner -scikit-learn==0.20 +scikit-learn>=0.23.2 diff --git a/src/sdk/pynni/tests/models/pytorch_models/mobilenet.py b/src/sdk/pynni/tests/models/pytorch_models/mobilenet.py new file mode 100644 index 0000000000..8d60c90a4c --- /dev/null +++ b/src/sdk/pynni/tests/models/pytorch_models/mobilenet.py @@ -0,0 +1,83 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import torch.nn as nn +import math + + +def conv_bn(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU(inplace=True) + ) + + +def conv_dw(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), + nn.BatchNorm2d(inp), + nn.ReLU(inplace=True), + + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU(inplace=True), + ) + + +class MobileNet(nn.Module): + def __init__(self, n_class, profile='normal'): + super(MobileNet, self).__init__() + + # original + if profile == 'normal': + in_planes = 32 + cfg = [64, (128, 2), 128, (256, 2), 256, (512, 2), 512, 512, 512, 512, 512, (1024, 2), 1024] + # 0.5 AMC + elif profile == '0.5flops': + in_planes = 24 + cfg = [48, (96, 2), 80, (192, 2), 200, (328, 2), 352, 368, 360, 328, 400, (736, 2), 752] + else: + raise NotImplementedError + + self.conv1 = conv_bn(3, in_planes, stride=2) + + self.features = self._make_layers(in_planes, cfg, conv_dw) + + self.classifier = nn.Sequential( + nn.Linear(cfg[-1], n_class), + ) + + self._initialize_weights() + + def forward(self, x): + x = self.conv1(x) + x = self.features(x) + x = x.mean(3).mean(2) # global average pooling + + x = self.classifier(x) + return x + + def _make_layers(self, in_planes, cfg, layer): + layers = [] + for x in cfg: + out_planes = x if isinstance(x, int) else x[0] + stride = 1 if isinstance(x, int) else x[1] + layers.append(layer(in_planes, out_planes, stride)) + in_planes = out_planes + return nn.Sequential(*layers) + + def 
_initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + n = m.weight.size(1) + m.weight.data.normal_(0, 0.01) + m.bias.data.zero_() diff --git a/src/sdk/pynni/tests/test_compressor.py b/src/sdk/pynni/tests/test_compressor_torch.py similarity index 85% rename from src/sdk/pynni/tests/test_compressor.py rename to src/sdk/pynni/tests/test_compressor_torch.py index 6a8727c9e4..8d631da25a 100644 --- a/src/sdk/pynni/tests/test_compressor.py +++ b/src/sdk/pynni/tests/test_compressor_torch.py @@ -3,33 +3,12 @@ from unittest import TestCase, main import numpy as np -import tensorflow as tf import torch import torch.nn.functional as F import schema import nni.compression.torch as torch_compressor import math -if tf.__version__ >= '2.0': - import nni.compression.tensorflow as tf_compressor - - -def get_tf_model(): - model = tf.keras.models.Sequential([ - tf.keras.layers.Conv2D(filters=5, kernel_size=7, input_shape=[28, 28, 1], activation='relu', padding="SAME"), - tf.keras.layers.MaxPooling2D(pool_size=2), - tf.keras.layers.Conv2D(filters=10, kernel_size=3, activation='relu', padding="SAME"), - tf.keras.layers.MaxPooling2D(pool_size=2), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(units=128, activation='relu'), - tf.keras.layers.Dropout(0.5), - tf.keras.layers.Dense(units=10, activation='softmax'), - ]) - model.compile(loss="sparse_categorical_crossentropy", - optimizer=tf.keras.optimizers.SGD(lr=1e-3), - metrics=["accuracy"]) - return model - class TorchModel(torch.nn.Module): def __init__(self): @@ -52,13 +31,6 @@ def forward(self, x): return F.log_softmax(x, dim=1) -def tf2(func): - def test_tf2_func(*args): - if tf.__version__ >= '2.0': - func(*args) - - return 
test_tf2_func - class CompressorTestCase(TestCase): def test_torch_quantizer_modules_detection(self): # test if modules can be detected @@ -88,13 +60,9 @@ def test_torch_quantizer_modules_detection(self): def test_torch_level_pruner(self): model = TorchModel() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5) configure_list = [{'sparsity': 0.8, 'op_types': ['default']}] - torch_compressor.LevelPruner(model, configure_list).compress() - - @tf2 - def test_tf_level_pruner(self): - configure_list = [{'sparsity': 0.8, 'op_types': ['default']}] - tf_compressor.LevelPruner(get_tf_model(), configure_list).compress() + torch_compressor.LevelPruner(model, configure_list, optimizer).compress() def test_torch_naive_quantizer(self): model = TorchModel() @@ -107,10 +75,6 @@ def test_torch_naive_quantizer(self): }] torch_compressor.NaiveQuantizer(model, configure_list).compress() - @tf2 - def test_tf_naive_quantizer(self): - tf_compressor.NaiveQuantizer(get_tf_model(), [{'op_types': ['default']}]).compress() - def test_torch_fpgm_pruner(self): """ With filters(kernels) weights defined as above (w), it is obvious that w[4] and w[5] is the Geometric Median @@ -128,7 +92,7 @@ def test_torch_fpgm_pruner(self): model = TorchModel() config_list = [{'sparsity': 0.6, 'op_types': ['Conv2d']}, {'sparsity': 0.2, 'op_types': ['Conv2d']}] - pruner = torch_compressor.FPGMPruner(model, config_list) + pruner = torch_compressor.FPGMPruner(model, config_list, torch.optim.SGD(model.parameters(), lr=0.01)) model.conv2.module.weight.data = torch.tensor(w).float() masks = pruner.calc_mask(model.conv2) @@ -140,23 +104,7 @@ def test_torch_fpgm_pruner(self): masks = pruner.calc_mask(model.conv2) assert all(torch.sum(masks['weight_mask'], (1, 2, 3)).numpy() == np.array([125., 125., 0., 0., 0., 0., 0., 0., 125., 125.])) - @tf2 - def test_tf_fpgm_pruner(self): - w = np.array([np.ones((5, 3, 3)) * (i+1) for i in range(10)]).astype(np.float32) - model = get_tf_model() - config_list = 
[{'sparsity': 0.2, 'op_types': ['Conv2D']}] - - pruner = tf_compressor.FPGMPruner(model, config_list) - weights = model.layers[2].weights - weights[0] = np.array(w).astype(np.float32).transpose([2, 3, 0, 1]).transpose([0, 1, 3, 2]) - model.layers[2].set_weights([weights[0], weights[1].numpy()]) - - layer = tf_compressor.compressor.LayerInfo(model.layers[2]) - masks = pruner.calc_mask(layer, config_list[0]).numpy() - masks = masks.reshape((-1, masks.shape[-1])).transpose([1, 0]) - - assert all(masks.sum((1)) == np.array([45., 45., 45., 45., 0., 0., 45., 45., 45., 45.])) - + def test_torch_l1filter_pruner(self): """ Filters with the minimum sum of the weights' L1 norm are pruned in this paper: @@ -314,7 +262,7 @@ def test_torch_QAT_quantizer(self): def test_torch_pruner_validation(self): # test bad configuraiton pruner_classes = [torch_compressor.__dict__[x] for x in \ - ['LevelPruner', 'SlimPruner', 'FPGMPruner', 'L1FilterPruner', 'L2FilterPruner', \ + ['LevelPruner', 'SlimPruner', 'FPGMPruner', 'L1FilterPruner', 'L2FilterPruner', 'AGPPruner',\ 'ActivationMeanRankFilterPruner', 'ActivationAPoZRankFilterPruner']] bad_configs = [ @@ -336,10 +284,11 @@ def test_torch_pruner_validation(self): ] ] model = TorchModel() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) for pruner_class in pruner_classes: for config_list in bad_configs: try: - pruner_class(model, config_list) + pruner_class(model, config_list, optimizer) print(config_list) assert False, 'Validation error should be raised for bad configuration' except schema.SchemaError: diff --git a/src/sdk/pynni/tests/test_graph_utils.py b/src/sdk/pynni/tests/test_graph_utils.py index 92851bc91c..f6181d5482 100644 --- a/src/sdk/pynni/tests/test_graph_utils.py +++ b/src/sdk/pynni/tests/test_graph_utils.py @@ -15,7 +15,7 @@ import unittest from unittest import TestCase, main -from nni._graph_utils import build_module_graph, build_graph, TorchModuleGraph +from nni._graph_utils import build_module_graph, build_graph, 
TorchModuleGraph, TUPLE_UNPACK_KIND class BackboneModel1(nn.Module): def __init__(self): @@ -194,5 +194,101 @@ def forward(self, x): assert len(nodes) == 1 node = nodes[0] + @unittest.skipIf(torch.__version__ < "1.4.0", "not supported") + def test_module_unpack(self): + """ + test the tuple/list unpack function of TorchModuleGraph. + Following models are from the issue 2756 + https://github.com/microsoft/nni/issues/2756. + MyModule will have two successive tuple unpack operations + between the B and C. + """ + class CBR(nn.Module): + def __init__(self, i, o): + super(CBR, self).__init__() + self.conv1 = nn.Conv2d(i, o, kernel_size=1) + self.bn1 = nn.BatchNorm2d(o) + self.act1 = nn.ReLU() + + def forward(self, x): + return self.act1(self.bn1(self.conv1(x))) + + + class A(nn.Module): + def __init__(self): + super(A, self).__init__() + self.conv1 = CBR(3, 6, ) + self.conv2 = CBR(6, 8, ) + self.conv3 = CBR(6, 12) + + def forward(self, x): + x1 = self.conv1(x) + x2 = self.conv2(x1) + x3 = self.conv3(x1) + return (x2, x3) + + + class B1(nn.Module): + def __init__(self): + super(B1, self).__init__() + self.conv1 = CBR(12, 32) + self.conv2 = CBR(32, 32) + self.conv3 = CBR(32, 32) + + def forward(self, x): + x1 = self.conv1(x) + x2 = self.conv2(x1) + x3 = self.conv3(x2) + return (x1, x2, x3) + + class B(nn.Module): + def __init__(self): + super(B, self).__init__() + self.b = B1() + + def forward(self, x): + return self.b(x[-1]) + + class C(nn.Module): + def __init__(self): + super(C, self).__init__() + self.conv1 = CBR(8, 32) + self.conv2 = CBR(12, 32) + self.conv3 = CBR(32, 32) + self.conv4 = CBR(32, 32) + self.conv5 = CBR(32, 32) + + def forward(self, x): + return(self.conv1(x[0]), self.conv2(x[1]), self.conv3(x[2]),self.conv4(x[3]),self.conv5(x[4])) + + class MyModule(nn.Module): + def __init__(self): + super(MyModule, self).__init__() + self.a = A() + self.b = B() + # self.dummy = Dummy() + self.c = C() + + def forward(self, x): + x_a = self.a(x) + x_b = self.b(x_a) + 
xc = self.c(x_a + x_b) + return xc + + dummy_input = torch.rand(1, 3, 28, 28) + model = MyModule() + graph = TorchModuleGraph(model, dummy_input) + graph.unpack_manually() + for node in graph.nodes_py.nodes_op: + # The input of the function nodes should + # not come from the TupleUnpack node, because + # all the TupleUnpack nodes have been removed(unpacked) + # manually + for _input in node.inputs: + if _input in graph.output_to_node: + preprocessor = graph.output_to_node[_input] + assert preprocessor.op_type != TUPLE_UNPACK_KIND + + if __name__ == '__main__': main() diff --git a/src/sdk/pynni/tests/test_model_speedup.py b/src/sdk/pynni/tests/test_model_speedup.py index a06f991c97..845ed793ff 100644 --- a/src/sdk/pynni/tests/test_model_speedup.py +++ b/src/sdk/pynni/tests/test_model_speedup.py @@ -145,18 +145,18 @@ def test_speedup_bigmodel(self): assert model.backbone2.fc1.in_features == int(orig_model.backbone2.fc1.in_features * SPARSITY) def test_speedup_integration(self): - for model_name in ['resnet18', 'squeezenet1_1', 'mobilenet_v2']: + for model_name in ['resnet18', 'squeezenet1_1', 'mobilenet_v2', 'densenet121', 'inception_v3']: Model = getattr(models, model_name) net = Model(pretrained=True, progress=False).to(device) + speedup_model = Model().to(device) net.eval() # this line is necessary + speedup_model.eval() # random generate the prune config for the pruner cfgs = generate_random_sparsity(net) pruner = L1FilterPruner(net, cfgs) pruner.compress() pruner.export_model(MODEL_FILE, MASK_FILE) pruner._unwrap_model() - speedup_model = Model().to(device) - speedup_model.eval() state_dict = torch.load(MODEL_FILE) speedup_model.load_state_dict(state_dict) zero_bn_bias(net) diff --git a/src/sdk/pynni/tests/test_pruners.py b/src/sdk/pynni/tests/test_pruners.py index 1fab9b2b2a..7157d9bc08 100644 --- a/src/sdk/pynni/tests/test_pruners.py +++ b/src/sdk/pynni/tests/test_pruners.py @@ -5,11 +5,14 @@ import torch import torch.nn as nn import torch.nn.functional as F 
+import torch.utils.data import math from unittest import TestCase, main from nni.compression.torch import LevelPruner, SlimPruner, FPGMPruner, L1FilterPruner, \ L2FilterPruner, AGPPruner, ActivationMeanRankFilterPruner, ActivationAPoZRankFilterPruner, \ - TaylorFOWeightFilterPruner, NetAdaptPruner, SimulatedAnnealingPruner, ADMMPruner, AutoCompressPruner + TaylorFOWeightFilterPruner, NetAdaptPruner, SimulatedAnnealingPruner, ADMMPruner, \ + AutoCompressPruner, AMCPruner +from models.pytorch_models.mobilenet import MobileNet def validate_sparsity(wrapper, sparsity, bias=False): masks = [wrapper.weight_mask] @@ -154,6 +157,12 @@ def validate_sparsity(wrapper, sparsity, bias=False): 'evaluator': lambda model: 0.9, 'dummy_input': torch.randn([64, 1, 28, 28]), 'validators': [] + }, + 'amc': { + 'pruner_class': AMCPruner, + 'config_list':[{ + 'op_types': ['Conv2d', 'Linear'] + }] } } @@ -192,9 +201,7 @@ def pruners_test(pruner_names=['level', 'agp', 'slim', 'fpgm', 'l1', 'l2', 'tayl pruner = prune_config[pruner_name]['pruner_class'](model, config_list, trainer=prune_config[pruner_name]['trainer']) elif pruner_name == 'autocompress': pruner = prune_config[pruner_name]['pruner_class'](model, config_list, trainer=prune_config[pruner_name]['trainer'], evaluator=prune_config[pruner_name]['evaluator'], dummy_input=x) - elif pruner_name in ['level', 'slim', 'fpgm', 'l1', 'l2']: - pruner = prune_config[pruner_name]['pruner_class'](model, config_list) - elif pruner_name in ['agp', 'taylorfo', 'mean_activation', 'apoz']: + else: pruner = prune_config[pruner_name]['pruner_class'](model, config_list, optimizer) pruner.compress() @@ -246,6 +253,13 @@ def test_agp(pruning_algorithm): # set abs_tol = 0.2, considering the sparsity error for channel pruning when number of channels is small. assert math.isclose(actual_sparsity, target_sparsity, abs_tol=0.2) +class SimpleDataset: + def __getitem__(self, index): + return torch.randn(3, 32, 32), 1. 
+ + def __len__(self): + return 1000 + class PrunerTestCase(TestCase): def test_pruners(self): pruners_test(bias=True) @@ -261,5 +275,15 @@ def test_agp_pruner(self): prune_config['agp']['config_list'][0]['op_types'] = ['default'] test_agp(pruning_algorithm) + def testAMC(self): + model = MobileNet(n_class=10) + + def validate(val_loader, model): + return 80. + val_loader = torch.utils.data.DataLoader(SimpleDataset(), batch_size=16, shuffle=False, drop_last=True) + config_list = prune_config['amc']['config_list'] + pruner = AMCPruner(model, config_list, validate, val_loader, train_episode=1) + pruner.compress() + if __name__ == '__main__': main() diff --git a/src/webui/mock/all-types-metric.json b/src/webui/mock/all-types-metric.json new file mode 100644 index 0000000000..ec6bc19457 --- /dev/null +++ b/src/webui/mock/all-types-metric.json @@ -0,0 +1,2527 @@ + +{ + "checkStatus": { + "status": "DONE", + "errors": [] + }, + "experiment": { + "id": "Tkaxm2mb", + "revision": 118, + "execDuration": 150, + "logDir": "/***/nni/experiments/Tkaxm2mb", + "nextSequenceId": 110, + "params": { + "authorName": "default", + "experimentName": "default", + "trialConcurrency": 10, + "maxExecDuration": 3600, + "maxTrialNum": 100, + "searchSpace": "{\"intermediate1\": {\"_type\": \"choice\", \"_value\": [\"normal\", \"inf\", \"neginf\", \"nan\", \"string\", \"dict-empty\", \"dict-normal\", \"dict-nodefault\", \"dict-defaultdict\"]}, \"intermediate2\": {\"_type\": \"choice\", \"_value\": [\"normal\", \"inf\", \"neginf\", \"nan\", \"string\", \"dict-empty\", \"dict-normal\", \"dict-nodefault\", \"dict-defaultdict\"]}, \"intermediate3\": {\"_type\": \"choice\", \"_value\": [\"normal\", \"inf\", \"neginf\", \"nan\", \"string\", \"dict-empty\", \"dict-normal\", \"dict-nodefault\", \"dict-defaultdict\"]}, \"intermediate_count\": {\"_type\": \"choice\", \"_value\": [0, 1, 2, 3]}, \"final1\": {\"_type\": \"choice\", \"_value\": [\"normal\", \"inf\", \"neginf\", \"nan\", \"string\", 
\"dict-empty\", \"dict-normal\", \"dict-nodefault\", \"dict-defaultdict\"]}, \"final2\": {\"_type\": \"choice\", \"_value\": [\"normal\", \"inf\", \"neginf\", \"nan\", \"string\", \"dict-empty\", \"dict-normal\", \"dict-nodefault\", \"dict-defaultdict\"]}, \"final_count\": {\"_type\": \"choice\", \"_value\": [0, 1, 2]}}", + "trainingServicePlatform": "local", + "tuner": { + "codeDir": "/***/nnidev/src/webui/tests/metrics-test/.", + "classFileName": "naive_random.py", + "className": "NaiveRandomTuner", + "checkpointDir": "/***/nni/experiments/Tkaxm2mb/checkpoint" + }, + "versionCheck": true, + "clusterMetaData": [ + { + "key": "codeDir", + "value": "/***/nnidev/src/webui/tests/metrics-test/." + }, + { + "key": "command", + "value": "python trial.py" + } + ] + }, + "startTime": 1595901129833, + "endTime": 1595901290657 + }, + "metricData": [ + { + "timestamp": 1595901141232, + "trialJobId": "sXvMz", + "parameterId": "0", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": -7.823851251971656, \\\"other\\\": -9.844189628757352}\"" + }, + { + "timestamp": 1595901141321, + "trialJobId": "y3owq", + "parameterId": "1", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"\\\"-5.8373125018382055\\\"\"" + }, + { + "timestamp": 1595901141347, + "trialJobId": "etEUl", + "parameterId": "2", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901141374, + "trialJobId": "r5pwY", + "parameterId": "3", + "type": "FINAL", + "sequence": 0, + "data": "\"{}\"" + }, + { + "timestamp": 1595901141455, + "trialJobId": "JxX0I", + "parameterId": "4", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901141543, + "trialJobId": "ywQvm", + "parameterId": "5", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": -9.945796251990785}}\"" + }, + { + "timestamp": 1595901141643, + "trialJobId": "tkxcP", + "parameterId": "6", + "type": 
"PERIODICAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901141708, + "trialJobId": "MjX3O", + "parameterId": "7", + "type": "FINAL", + "sequence": 0, + "data": "\"{}\"" + }, + { + "timestamp": 1595901141754, + "trialJobId": "MQlPp", + "parameterId": "9", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": -6.887609164015767}}\"" + }, + { + "timestamp": 1595901141756, + "trialJobId": "LKVCX", + "parameterId": "8", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901142236, + "trialJobId": "sXvMz", + "parameterId": "0", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{\\\"default\\\": -8.08656113718457, \\\"other\\\": 7.483152033140179}\"" + }, + { + "timestamp": 1595901142326, + "trialJobId": "y3owq", + "parameterId": "1", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": 6.896445698700774}}\"" + }, + { + "timestamp": 1595901142355, + "trialJobId": "etEUl", + "parameterId": "2", + "type": "FINAL", + "sequence": 0, + "data": "\"\\\"3.727416078457388\\\"\"" + }, + { + "timestamp": 1595901142458, + "trialJobId": "JxX0I", + "parameterId": "4", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": 8.963738323502998}}\"" + }, + { + "timestamp": 1595901142548, + "trialJobId": "ywQvm", + "parameterId": "5", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"4.024454725511186\"" + }, + { + "timestamp": 1595901142758, + "trialJobId": "MQlPp", + "parameterId": "9", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901142760, + "trialJobId": "LKVCX", + "parameterId": "8", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"\\\"0.982665154375141\\\"\"" + }, + { + "timestamp": 1595901143239, + "trialJobId": "sXvMz", + "parameterId": "0", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": 
{\\\"tensor\\\": 0, \\\"data\\\": -8.10531137074138}}\"" + }, + { + "timestamp": 1595901143362, + "trialJobId": "etEUl", + "parameterId": "2", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901143462, + "trialJobId": "JxX0I", + "parameterId": "4", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"{\\\"other\\\": -5.433157293214572}\"" + }, + { + "timestamp": 1595901143552, + "trialJobId": "ywQvm", + "parameterId": "5", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901143761, + "trialJobId": "MQlPp", + "parameterId": "9", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 8.927687040316364}\"" + }, + { + "timestamp": 1595901143764, + "trialJobId": "LKVCX", + "parameterId": "8", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901144556, + "trialJobId": "ywQvm", + "parameterId": "5", + "type": "FINAL", + "sequence": 0, + "data": "\"-4.804921436452929\"" + }, + { + "timestamp": 1595901144765, + "trialJobId": "MQlPp", + "parameterId": "9", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901156846, + "trialJobId": "fJHIW", + "parameterId": "10", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": -6.3254623036120545, \\\"other\\\": 6.661583778582873}\"" + }, + { + "timestamp": 1595901156921, + "trialJobId": "z7WgL", + "parameterId": "13", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901156954, + "trialJobId": "Ofyt2", + "parameterId": "12", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"1.7787198770217199\"" + }, + { + "timestamp": 1595901157264, + "trialJobId": "aKV3K", + "parameterId": "17", + "type": "FINAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901157336, + "trialJobId": "EFsFo", + "parameterId": "19", + "type": "FINAL", + "sequence": 0, + "data": "\"-0.9452602480917385\"" + }, + { + 
"timestamp": 1595901157852, + "trialJobId": "fJHIW", + "parameterId": "10", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{}\"" + }, + { + "timestamp": 1595901157925, + "trialJobId": "z7WgL", + "parameterId": "13", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901157959, + "trialJobId": "Ofyt2", + "parameterId": "12", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901158930, + "trialJobId": "z7WgL", + "parameterId": "13", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 1.8045794393579122}\"" + }, + { + "timestamp": 1595901158961, + "trialJobId": "Ofyt2", + "parameterId": "12", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": 8.685460178518326}}\"" + }, + { + "timestamp": 1595901159931, + "trialJobId": "z7WgL", + "parameterId": "13", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": -7.794922103589295}\"" + }, + { + "timestamp": 1595901159966, + "trialJobId": "Ofyt2", + "parameterId": "12", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": 7.483634247448858}}\"" + }, + { + "timestamp": 1595901160970, + "trialJobId": "Ofyt2", + "parameterId": "12", + "type": "FINAL", + "sequence": 0, + "data": "\"\\\"-3.905177892216985\\\"\"" + }, + { + "timestamp": 1595901172384, + "trialJobId": "dUJTL", + "parameterId": "20", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901172404, + "trialJobId": "xAoeQ", + "parameterId": "21", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": 4.1133250278375915, \\\"other\\\": -2.4983824090454387}\"" + }, + { + "timestamp": 1595901172422, + "trialJobId": "de6XT", + "parameterId": "22", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": -4.1861178094861495, \\\"other\\\": -1.8025564533646659}\"" + }, + { + "timestamp": 
1595901172467, + "trialJobId": "Rofrb", + "parameterId": "24", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 8.461683188282755}\"" + }, + { + "timestamp": 1595901172471, + "trialJobId": "MOOrR", + "parameterId": "25", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": -6.970771731370702}}\"" + }, + { + "timestamp": 1595901172521, + "trialJobId": "A7C0a", + "parameterId": "23", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901172629, + "trialJobId": "p2m5y", + "parameterId": "26", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"-4.874888030632755\"" + }, + { + "timestamp": 1595901172661, + "trialJobId": "mSPRF", + "parameterId": "27", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901172822, + "trialJobId": "G5nv9", + "parameterId": "28", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901172910, + "trialJobId": "ciSWN", + "parameterId": "29", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901173388, + "trialJobId": "dUJTL", + "parameterId": "20", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901173408, + "trialJobId": "xAoeQ", + "parameterId": "21", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901173428, + "trialJobId": "de6XT", + "parameterId": "22", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901173472, + "trialJobId": "Rofrb", + "parameterId": "24", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": -8.764260228545442, \\\"other\\\": 1.191253727619479}\"" + }, + { + "timestamp": 1595901173476, + "trialJobId": "MOOrR", + "parameterId": "25", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"\\\"-9.609870583781277\\\"\"" + }, + { + "timestamp": 
1595901173524, + "trialJobId": "A7C0a", + "parameterId": "23", + "type": "FINAL", + "sequence": 0, + "data": "\"{}\"" + }, + { + "timestamp": 1595901173633, + "trialJobId": "p2m5y", + "parameterId": "26", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{\\\"other\\\": 6.950769342806488}\"" + }, + { + "timestamp": 1595901173664, + "trialJobId": "mSPRF", + "parameterId": "27", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901173827, + "trialJobId": "G5nv9", + "parameterId": "28", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"\\\"8.83440457248663\\\"\"" + }, + { + "timestamp": 1595901173915, + "trialJobId": "ciSWN", + "parameterId": "29", + "type": "FINAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901174413, + "trialJobId": "xAoeQ", + "parameterId": "21", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"1.8151933189474878\"" + }, + { + "timestamp": 1595901174434, + "trialJobId": "de6XT", + "parameterId": "22", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 0.8472658215331563}\"" + }, + { + "timestamp": 1595901174481, + "trialJobId": "MOOrR", + "parameterId": "25", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"{}\"" + }, + { + "timestamp": 1595901174636, + "trialJobId": "p2m5y", + "parameterId": "26", + "type": "FINAL", + "sequence": 0, + "data": "\"9.902729745066438\"" + }, + { + "timestamp": 1595901174667, + "trialJobId": "mSPRF", + "parameterId": "27", + "type": "FINAL", + "sequence": 0, + "data": "\"\\\"-2.5183912965656763\\\"\"" + }, + { + "timestamp": 1595901174831, + "trialJobId": "G5nv9", + "parameterId": "28", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"{\\\"default\\\": 5.406128202621579, \\\"other\\\": -6.350852877668696}\"" + }, + { + "timestamp": 1595901175418, + "trialJobId": "xAoeQ", + "parameterId": "21", + "type": "FINAL", + "sequence": 0, + "data": "\"-8.43771544998285\"" + }, + { + "timestamp": 1595901175438, + "trialJobId": 
"de6XT", + "parameterId": "22", + "type": "FINAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901175485, + "trialJobId": "MOOrR", + "parameterId": "25", + "type": "FINAL", + "sequence": 0, + "data": "\"\\\"2.8954800063491586\\\"\"" + }, + { + "timestamp": 1595901175671, + "trialJobId": "mSPRF", + "parameterId": "27", + "type": "FINAL", + "sequence": 0, + "data": "\"{}\"" + }, + { + "timestamp": 1595901175834, + "trialJobId": "G5nv9", + "parameterId": "28", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901176422, + "trialJobId": "xAoeQ", + "parameterId": "21", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901176489, + "trialJobId": "MOOrR", + "parameterId": "25", + "type": "FINAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901176838, + "trialJobId": "G5nv9", + "parameterId": "28", + "type": "FINAL", + "sequence": 0, + "data": "\"3.5218235306581356\"" + }, + { + "timestamp": 1595901187944, + "trialJobId": "zaTFd", + "parameterId": "33", + "type": "FINAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901187975, + "trialJobId": "WrtVY", + "parameterId": "30", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": 6.446947454739629, \\\"other\\\": 4.2394889873504695}\"" + }, + { + "timestamp": 1595901188002, + "trialJobId": "RZ45L", + "parameterId": "32", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"8.53321110060542\"" + }, + { + "timestamp": 1595901188047, + "trialJobId": "Ss6eU", + "parameterId": "34", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": -2.6862091423857564, \\\"other\\\": 8.839298350682931}\"" + }, + { + "timestamp": 1595901188087, + "trialJobId": "J5lYo", + "parameterId": "31", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901188183, + "trialJobId": "tb6Tr", + "parameterId": "35", + "type": 
"FINAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901188267, + "trialJobId": "ZMzvY", + "parameterId": "36", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901188275, + "trialJobId": "PNJDQ", + "parameterId": "39", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"\\\"6.952238868136657\\\"\"" + }, + { + "timestamp": 1595901188309, + "trialJobId": "VFEj6", + "parameterId": "37", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 5.022546354803907}\"" + }, + { + "timestamp": 1595901188338, + "trialJobId": "mcAWe", + "parameterId": "38", + "type": "FINAL", + "sequence": 0, + "data": "\"5.528136238632005\"" + }, + { + "timestamp": 1595901188979, + "trialJobId": "WrtVY", + "parameterId": "30", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"-4.361541180657595\"" + }, + { + "timestamp": 1595901189006, + "trialJobId": "RZ45L", + "parameterId": "32", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"2.781596441148668\"" + }, + { + "timestamp": 1595901189053, + "trialJobId": "Ss6eU", + "parameterId": "34", + "type": "FINAL", + "sequence": 0, + "data": "\"-3.3592681835773286\"" + }, + { + "timestamp": 1595901189093, + "trialJobId": "J5lYo", + "parameterId": "31", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"\\\"-8.400621787401052\\\"\"" + }, + { + "timestamp": 1595901189269, + "trialJobId": "ZMzvY", + "parameterId": "36", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901189279, + "trialJobId": "PNJDQ", + "parameterId": "39", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": 2.1053341113677}}\"" + }, + { + "timestamp": 1595901189343, + "trialJobId": "mcAWe", + "parameterId": "38", + "type": "FINAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901189984, + "trialJobId": "WrtVY", + "parameterId": "30", + "type": "PERIODICAL", + "sequence": 2, 
+ "data": "\"{\\\"default\\\": -9.29846727931565, \\\"other\\\": -3.5575764805061}\"" + }, + { + "timestamp": 1595901190056, + "trialJobId": "Ss6eU", + "parameterId": "34", + "type": "FINAL", + "sequence": 0, + "data": "\"6.581757373301858\"" + }, + { + "timestamp": 1595901190098, + "trialJobId": "J5lYo", + "parameterId": "31", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901190273, + "trialJobId": "ZMzvY", + "parameterId": "36", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"{\\\"other\\\": 3.39802649436532}\"" + }, + { + "timestamp": 1595901190283, + "trialJobId": "PNJDQ", + "parameterId": "39", + "type": "FINAL", + "sequence": 0, + "data": "\"-1.8105252216174517\"" + }, + { + "timestamp": 1595901190988, + "trialJobId": "WrtVY", + "parameterId": "30", + "type": "FINAL", + "sequence": 0, + "data": "\"9.357586503792628\"" + }, + { + "timestamp": 1595901191101, + "trialJobId": "J5lYo", + "parameterId": "31", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": -4.784440856817207}}\"" + }, + { + "timestamp": 1595901191277, + "trialJobId": "ZMzvY", + "parameterId": "36", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901191287, + "trialJobId": "PNJDQ", + "parameterId": "39", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 3.1762929944716927}\"" + }, + { + "timestamp": 1595901192106, + "trialJobId": "J5lYo", + "parameterId": "31", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 9.239821847210145}\"" + }, + { + "timestamp": 1595901203447, + "trialJobId": "B0prO", + "parameterId": "41", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"\\\"-2.479154993253598\\\"\"" + }, + { + "timestamp": 1595901203492, + "trialJobId": "ggpj9", + "parameterId": "43", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901203506, + "trialJobId": "ta3sm", + 
"parameterId": "40", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 9.24069239452307}\"" + }, + { + "timestamp": 1595901203549, + "trialJobId": "IZ5SL", + "parameterId": "44", + "type": "FINAL", + "sequence": 0, + "data": "\"\\\"-0.9142325374848674\\\"\"" + }, + { + "timestamp": 1595901203646, + "trialJobId": "MInUq", + "parameterId": "45", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901203705, + "trialJobId": "YWceT", + "parameterId": "46", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901203869, + "trialJobId": "idTj5", + "parameterId": "47", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 6.469325465919496}\"" + }, + { + "timestamp": 1595901203924, + "trialJobId": "LLkId", + "parameterId": "49", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901204452, + "trialJobId": "B0prO", + "parameterId": "41", + "type": "FINAL", + "sequence": 0, + "data": "\"\\\"5.017413317607618\\\"\"" + }, + { + "timestamp": 1595901204496, + "trialJobId": "ggpj9", + "parameterId": "43", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"\\\"4.245668260906047\\\"\"" + }, + { + "timestamp": 1595901204511, + "trialJobId": "ta3sm", + "parameterId": "40", + "type": "FINAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901204553, + "trialJobId": "IZ5SL", + "parameterId": "44", + "type": "FINAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901204651, + "trialJobId": "MInUq", + "parameterId": "45", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"\\\"0.25272592242281533\\\"\"" + }, + { + "timestamp": 1595901204710, + "trialJobId": "YWceT", + "parameterId": "46", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": -6.908361500557971}}\"" + }, + { + "timestamp": 1595901204874, + "trialJobId": "idTj5", + 
"parameterId": "47", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901204929, + "trialJobId": "LLkId", + "parameterId": "49", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": -1.7045174390916955, \\\"other\\\": 9.14883282895672}\"" + }, + { + "timestamp": 1595901205501, + "trialJobId": "ggpj9", + "parameterId": "43", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"{}\"" + }, + { + "timestamp": 1595901205516, + "trialJobId": "ta3sm", + "parameterId": "40", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": 1.7906001963110256, \\\"other\\\": 0.7111312975095512}\"" + }, + { + "timestamp": 1595901205714, + "trialJobId": "YWceT", + "parameterId": "46", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"{\\\"other\\\": -5.9019641918541055}\"" + }, + { + "timestamp": 1595901205878, + "trialJobId": "idTj5", + "parameterId": "47", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"{\\\"other\\\": 2.823040228107409}\"" + }, + { + "timestamp": 1595901206505, + "trialJobId": "ggpj9", + "parameterId": "43", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": 4.182220602389556}}\"" + }, + { + "timestamp": 1595901206882, + "trialJobId": "idTj5", + "parameterId": "47", + "type": "FINAL", + "sequence": 0, + "data": "\"-4.4221564350515274\"" + }, + { + "timestamp": 1595901219027, + "trialJobId": "ZbXHn", + "parameterId": "52", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901219044, + "trialJobId": "En80l", + "parameterId": "51", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{}\"" + }, + { + "timestamp": 1595901219069, + "trialJobId": "l99Rx", + "parameterId": "50", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": -9.014544365514672}}\"" + }, + { + "timestamp": 1595901219082, + "trialJobId": "ZnEue", + "parameterId": "54", + 
"type": "PERIODICAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901219271, + "trialJobId": "elkq7", + "parameterId": "55", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 5.0465681238608}\"" + }, + { + "timestamp": 1595901219313, + "trialJobId": "eE79m", + "parameterId": "56", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{}\"" + }, + { + "timestamp": 1595901219316, + "trialJobId": "glY0F", + "parameterId": "57", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": -7.127630310607653}}\"" + }, + { + "timestamp": 1595901219433, + "trialJobId": "RQQYv", + "parameterId": "58", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901219441, + "trialJobId": "mYziy", + "parameterId": "59", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"-5.80358242411701\"" + }, + { + "timestamp": 1595901220032, + "trialJobId": "ZbXHn", + "parameterId": "52", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901220048, + "trialJobId": "En80l", + "parameterId": "51", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": 2.6298452518160538, \\\"other\\\": -0.7910217651464624}\"" + }, + { + "timestamp": 1595901220075, + "trialJobId": "l99Rx", + "parameterId": "50", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"\\\"5.408334907304216\\\"\"" + }, + { + "timestamp": 1595901220086, + "trialJobId": "ZnEue", + "parameterId": "54", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{\\\"other\\\": -1.0699150731178424}\"" + }, + { + "timestamp": 1595901220276, + "trialJobId": "elkq7", + "parameterId": "55", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901220437, + "trialJobId": "RQQYv", + "parameterId": "58", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{\\\"default\\\": 5.991596839229384, \\\"other\\\": -5.791984484999113}\"" + 
}, + { + "timestamp": 1595901221036, + "trialJobId": "ZbXHn", + "parameterId": "52", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"-5.745821975821488\"" + }, + { + "timestamp": 1595901221079, + "trialJobId": "l99Rx", + "parameterId": "50", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"{\\\"other\\\": -2.8010065229085024}\"" + }, + { + "timestamp": 1595901221280, + "trialJobId": "elkq7", + "parameterId": "55", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"{\\\"default\\\": -7.3137237874911705, \\\"other\\\": -7.995517504106601}\"" + }, + { + "timestamp": 1595901221441, + "trialJobId": "RQQYv", + "parameterId": "58", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"\\\"-1.3969094674689302\\\"\"" + }, + { + "timestamp": 1595901222041, + "trialJobId": "ZbXHn", + "parameterId": "52", + "type": "FINAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901222284, + "trialJobId": "elkq7", + "parameterId": "55", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": 8.087980079012624}}\"" + }, + { + "timestamp": 1595901222446, + "trialJobId": "RQQYv", + "parameterId": "58", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901223290, + "trialJobId": "elkq7", + "parameterId": "55", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 4.0721479933987474}\"" + }, + { + "timestamp": 1595901223449, + "trialJobId": "RQQYv", + "parameterId": "58", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901234574, + "trialJobId": "VmI7f", + "parameterId": "60", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"other\\\": -1.820722180261674}\"" + }, + { + "timestamp": 1595901234697, + "trialJobId": "VSWkZ", + "parameterId": "63", + "type": "FINAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901234732, + "trialJobId": "EZUe0", + "parameterId": "62", + "type": "PERIODICAL", + 
"sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901234745, + "trialJobId": "zHVA2", + "parameterId": "64", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"\\\"5.633256430913928\\\"\"" + }, + { + "timestamp": 1595901234746, + "trialJobId": "a1MOX", + "parameterId": "61", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901234806, + "trialJobId": "u8t3k", + "parameterId": "66", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{}\"" + }, + { + "timestamp": 1595901234898, + "trialJobId": "OuLsc", + "parameterId": "65", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": 8.765460410421554, \\\"other\\\": 6.246732298977708}\"" + }, + { + "timestamp": 1595901234949, + "trialJobId": "eGrff", + "parameterId": "67", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901234962, + "trialJobId": "Ujc39", + "parameterId": "68", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"other\\\": -1.1268667476429037}\"" + }, + { + "timestamp": 1595901235066, + "trialJobId": "wg7hy", + "parameterId": "69", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901235579, + "trialJobId": "VmI7f", + "parameterId": "60", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 5.584444622345025}\"" + }, + { + "timestamp": 1595901235703, + "trialJobId": "VSWkZ", + "parameterId": "63", + "type": "FINAL", + "sequence": 0, + "data": "\"\\\"-3.8928731668227456\\\"\"" + }, + { + "timestamp": 1595901235735, + "trialJobId": "EZUe0", + "parameterId": "62", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{\\\"default\\\": -9.739986972816562, \\\"other\\\": -0.357173900732942}\"" + }, + { + "timestamp": 1595901235743, + "trialJobId": "zHVA2", + "parameterId": "64", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{}\"" + }, + { + "timestamp": 1595901235747, + "trialJobId": "a1MOX", + "parameterId": "61", 
+ "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": -2.2757225430354033}}\"" + }, + { + "timestamp": 1595901235809, + "trialJobId": "u8t3k", + "parameterId": "66", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"2.6146452798999604\"" + }, + { + "timestamp": 1595901235903, + "trialJobId": "OuLsc", + "parameterId": "65", + "type": "FINAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901235953, + "trialJobId": "eGrff", + "parameterId": "67", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901235967, + "trialJobId": "Ujc39", + "parameterId": "68", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"5.879185588587639\"" + }, + { + "timestamp": 1595901236070, + "trialJobId": "wg7hy", + "parameterId": "69", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{}\"" + }, + { + "timestamp": 1595901236584, + "trialJobId": "VmI7f", + "parameterId": "60", + "type": "FINAL", + "sequence": 0, + "data": "\"{}\"" + }, + { + "timestamp": 1595901236740, + "trialJobId": "EZUe0", + "parameterId": "62", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": 1.091872290620957}}\"" + }, + { + "timestamp": 1595901236749, + "trialJobId": "zHVA2", + "parameterId": "64", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901236813, + "trialJobId": "u8t3k", + "parameterId": "66", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"8.789904140828813\"" + }, + { + "timestamp": 1595901236956, + "trialJobId": "eGrff", + "parameterId": "67", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"{}\"" + }, + { + "timestamp": 1595901236971, + "trialJobId": "Ujc39", + "parameterId": "68", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901237078, + "trialJobId": "wg7hy", + "parameterId": "69", + "type": "FINAL", + "sequence": 0, + 
"data": "\"NaN\"" + }, + { + "timestamp": 1595901237754, + "trialJobId": "zHVA2", + "parameterId": "64", + "type": "FINAL", + "sequence": 0, + "data": "\"\\\"6.067140454523734\\\"\"" + }, + { + "timestamp": 1595901237817, + "trialJobId": "u8t3k", + "parameterId": "66", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": 6.738254874676596, \\\"other\\\": 3.407365737620623}\"" + }, + { + "timestamp": 1595901237960, + "trialJobId": "eGrff", + "parameterId": "67", + "type": "FINAL", + "sequence": 0, + "data": "\"-8.799684391921716\"" + }, + { + "timestamp": 1595901250174, + "trialJobId": "rCq1z", + "parameterId": "70", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901250225, + "trialJobId": "lCV9F", + "parameterId": "71", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": -8.842244100829086, \\\"other\\\": -2.386236945799789}\"" + }, + { + "timestamp": 1595901250260, + "trialJobId": "IiCNj", + "parameterId": "72", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{}\"" + }, + { + "timestamp": 1595901250333, + "trialJobId": "e9bF7", + "parameterId": "74", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": 1.0496251402087449}}\"" + }, + { + "timestamp": 1595901250335, + "trialJobId": "QPHAP", + "parameterId": "73", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 2.32920853708144}\"" + }, + { + "timestamp": 1595901250425, + "trialJobId": "XURUT", + "parameterId": "75", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": -1.0644703377033373, \\\"other\\\": -9.313141516349681}\"" + }, + { + "timestamp": 1595901250515, + "trialJobId": "QafjF", + "parameterId": "76", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901250567, + "trialJobId": "ognsb", + "parameterId": "77", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{}\"" + }, + { + 
"timestamp": 1595901250642, + "trialJobId": "bsqIF", + "parameterId": "78", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"-8.862311093464992\"" + }, + { + "timestamp": 1595901250807, + "trialJobId": "OiDp3", + "parameterId": "79", + "type": "FINAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901251178, + "trialJobId": "rCq1z", + "parameterId": "70", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"6.248246468866515\"" + }, + { + "timestamp": 1595901251229, + "trialJobId": "lCV9F", + "parameterId": "71", + "type": "FINAL", + "sequence": 0, + "data": "\"-4.715082675645508\"" + }, + { + "timestamp": 1595901251264, + "trialJobId": "IiCNj", + "parameterId": "72", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901251338, + "trialJobId": "e9bF7", + "parameterId": "74", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": -5.9184589897185536}}\"" + }, + { + "timestamp": 1595901251339, + "trialJobId": "QPHAP", + "parameterId": "73", + "type": "FINAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901251429, + "trialJobId": "XURUT", + "parameterId": "75", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901251519, + "trialJobId": "QafjF", + "parameterId": "76", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"2.8245498927483315\"" + }, + { + "timestamp": 1595901251572, + "trialJobId": "ognsb", + "parameterId": "77", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"\\\"-8.995058023766827\\\"\"" + }, + { + "timestamp": 1595901251646, + "trialJobId": "bsqIF", + "parameterId": "78", + "type": "FINAL", + "sequence": 0, + "data": "\"0.8160856187719805\"" + }, + { + "timestamp": 1595901251812, + "trialJobId": "OiDp3", + "parameterId": "79", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901252345, + "trialJobId": "QPHAP", + "parameterId": 
"73", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901252345, + "trialJobId": "e9bF7", + "parameterId": "74", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901252434, + "trialJobId": "XURUT", + "parameterId": "75", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901252523, + "trialJobId": "QafjF", + "parameterId": "76", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"{\\\"other\\\": 1.0945194250479169}\"" + }, + { + "timestamp": 1595901252575, + "trialJobId": "ognsb", + "parameterId": "77", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901253438, + "trialJobId": "XURUT", + "parameterId": "75", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901253527, + "trialJobId": "QafjF", + "parameterId": "76", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": 2.541955892720406, \\\"other\\\": -4.377781317201417}\"" + }, + { + "timestamp": 1595901265903, + "trialJobId": "wkVrB", + "parameterId": "82", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901265939, + "trialJobId": "bQhQx", + "parameterId": "81", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"other\\\": -7.7245400592833535}\"" + }, + { + "timestamp": 1595901266079, + "trialJobId": "VstNm", + "parameterId": "84", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": -6.8832078296276, \\\"other\\\": -7.67458445595935}\"" + }, + { + "timestamp": 1595901266081, + "trialJobId": "GRUrH", + "parameterId": "85", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901266190, + "trialJobId": "RzOte", + "parameterId": "87", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": -2.6660304258057117}}\"" + }, + { + "timestamp": 
1595901266208, + "trialJobId": "Sb2tj", + "parameterId": "86", + "type": "FINAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901266317, + "trialJobId": "NB1ou", + "parameterId": "89", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"default\\\": -6.409147608132686, \\\"other\\\": 3.281989187926694}\"" + }, + { + "timestamp": 1595901266894, + "trialJobId": "wkVrB", + "parameterId": "82", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{}\"" + }, + { + "timestamp": 1595901266944, + "trialJobId": "bQhQx", + "parameterId": "81", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901267086, + "trialJobId": "GRUrH", + "parameterId": "85", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901267194, + "trialJobId": "RzOte", + "parameterId": "87", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{\\\"default\\\": -5.66892041315209, \\\"other\\\": 2.3283168851019287}\"" + }, + { + "timestamp": 1595901267322, + "trialJobId": "NB1ou", + "parameterId": "89", + "type": "FINAL", + "sequence": 0, + "data": "\"{}\"" + }, + { + "timestamp": 1595901267898, + "trialJobId": "wkVrB", + "parameterId": "82", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"{}\"" + }, + { + "timestamp": 1595901268197, + "trialJobId": "RzOte", + "parameterId": "87", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901268902, + "trialJobId": "wkVrB", + "parameterId": "82", + "type": "FINAL", + "sequence": 0, + "data": "\"\\\"-8.643383856668647\\\"\"" + }, + { + "timestamp": 1595901269906, + "trialJobId": "wkVrB", + "parameterId": "82", + "type": "FINAL", + "sequence": 0, + "data": "\"\\\"5.674545659125172\\\"\"" + }, + { + "timestamp": 1595901281386, + "trialJobId": "utKiW", + "parameterId": "90", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901281478, + "trialJobId": 
"uPdSU", + "parameterId": "92", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901281487, + "trialJobId": "dPeSr", + "parameterId": "91", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901281537, + "trialJobId": "pCXHB", + "parameterId": "93", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901281579, + "trialJobId": "okxUn", + "parameterId": "95", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": -2.023400446813328}\"" + }, + { + "timestamp": 1595901281605, + "trialJobId": "QO9EO", + "parameterId": "94", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"{\\\"other\\\": -7.37571843162065}\"" + }, + { + "timestamp": 1595901281742, + "trialJobId": "MrNMC", + "parameterId": "97", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"\\\"-0.03936926816899877\\\"\"" + }, + { + "timestamp": 1595901281745, + "trialJobId": "zZquy", + "parameterId": "99", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"\\\"-5.576159193327383\\\"\"" + }, + { + "timestamp": 1595901281765, + "trialJobId": "tXlrm", + "parameterId": "98", + "type": "PERIODICAL", + "sequence": 0, + "data": "\"\\\"2.8878423268479807\\\"\"" + }, + { + "timestamp": 1595901282391, + "trialJobId": "utKiW", + "parameterId": "90", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"-Infinity\"" + }, + { + "timestamp": 1595901282484, + "trialJobId": "uPdSU", + "parameterId": "92", + "type": "FINAL", + "sequence": 0, + "data": "\"{}\"" + }, + { + "timestamp": 1595901282540, + "trialJobId": "pCXHB", + "parameterId": "93", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{\\\"other\\\": 1.9549026751543792}\"" + }, + { + "timestamp": 1595901282583, + "trialJobId": "okxUn", + "parameterId": "95", + "type": "FINAL", + "sequence": 0, + "data": "\"NaN\"" + }, + { + "timestamp": 1595901282611, + "trialJobId": "QO9EO", + "parameterId": "94", + "type": "PERIODICAL", 
+ "sequence": 1, + "data": "\"-3.3552487681830145\"" + }, + { + "timestamp": 1595901282746, + "trialJobId": "MrNMC", + "parameterId": "97", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"{\\\"default\\\": 3.641608082036285, \\\"other\\\": -8.348670460176342}\"" + }, + { + "timestamp": 1595901282749, + "trialJobId": "zZquy", + "parameterId": "99", + "type": "PERIODICAL", + "sequence": 1, + "data": "\"\\\"7.462277129845603\\\"\"" + }, + { + "timestamp": 1595901283396, + "trialJobId": "utKiW", + "parameterId": "90", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901283488, + "trialJobId": "uPdSU", + "parameterId": "92", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 5.491624461329462}\"" + }, + { + "timestamp": 1595901283544, + "trialJobId": "pCXHB", + "parameterId": "93", + "type": "FINAL", + "sequence": 0, + "data": "\"-6.732500732566957\"" + }, + { + "timestamp": 1595901283618, + "trialJobId": "QO9EO", + "parameterId": "94", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + }, + { + "timestamp": 1595901283751, + "trialJobId": "MrNMC", + "parameterId": "97", + "type": "PERIODICAL", + "sequence": 2, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": -3.38303756126507}}\"" + }, + { + "timestamp": 1595901284400, + "trialJobId": "utKiW", + "parameterId": "90", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": -5.853483580522072}}\"" + }, + { + "timestamp": 1595901284547, + "trialJobId": "pCXHB", + "parameterId": "93", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": -8.024581781976966}\"" + }, + { + "timestamp": 1595901284621, + "trialJobId": "QO9EO", + "parameterId": "94", + "type": "FINAL", + "sequence": 0, + "data": "\"\\\"-2.538557600204168\\\"\"" + }, + { + "timestamp": 1595901284756, + "trialJobId": "MrNMC", + "parameterId": "97", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 
-0.5479590285769511}\"" + }, + { + "timestamp": 1595901285760, + "trialJobId": "MrNMC", + "parameterId": "97", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + } + ], + "trialJobs": [ + { + "id": "sXvMz", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":0,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"dict-normal\",\"intermediate2\":\"dict-normal\",\"intermediate3\":\"normal\",\"intermediate_count\":2,\"final1\":\"dict-defaultdict\",\"final2\":\"dict-nodefault\",\"final_count\":1},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/sXvMz", + "startTime": 1595901139862, + "sequenceId": 0, + "endTime": 1595901143276, + "finalMetricData": [ + { + "timestamp": 1595901143239, + "trialJobId": "sXvMz", + "parameterId": "0", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": -8.10531137074138}}\"" + } + ] + }, + { + "id": "y3owq", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":1,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"string\",\"intermediate2\":\"dict-nodefault\",\"intermediate3\":\"dict-empty\",\"intermediate_count\":1,\"final1\":\"dict-defaultdict\",\"final2\":\"neginf\",\"final_count\":1},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/y3owq", + "startTime": 1595901139885, + "sequenceId": 1, + "endTime": 1595901142374, + "finalMetricData": [ + { + "timestamp": 1595901142326, + "trialJobId": "y3owq", + "parameterId": "1", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": 6.896445698700774}}\"" + } + ] + }, + { + "id": "etEUl", + "status": "SUCCEEDED", + "hyperParameters": [ + 
"{\"parameter_id\":2,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"neginf\",\"intermediate2\":\"inf\",\"intermediate3\":\"string\",\"intermediate_count\":1,\"final1\":\"string\",\"final2\":\"inf\",\"final_count\":2},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/etEUl", + "startTime": 1595901139903, + "sequenceId": 2, + "endTime": 1595901143415, + "finalMetricData": [ + { + "timestamp": 1595901142355, + "trialJobId": "etEUl", + "parameterId": "2", + "type": "FINAL", + "sequence": 0, + "data": "\"\\\"3.727416078457388\\\"\"" + } + ] + }, + { + "id": "r5pwY", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":3,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"inf\",\"intermediate2\":\"dict-empty\",\"intermediate3\":\"dict-nodefault\",\"intermediate_count\":0,\"final1\":\"dict-empty\",\"final2\":\"dict-normal\",\"final_count\":1},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/r5pwY", + "startTime": 1595901139918, + "sequenceId": 3, + "endTime": 1595901141410, + "finalMetricData": [ + { + "timestamp": 1595901141374, + "trialJobId": "r5pwY", + "parameterId": "3", + "type": "FINAL", + "sequence": 0, + "data": "\"{}\"" + } + ] + }, + { + "id": "JxX0I", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":4,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"nan\",\"intermediate2\":\"dict-defaultdict\",\"intermediate3\":\"dict-nodefault\",\"intermediate_count\":3,\"final1\":\"dict-defaultdict\",\"final2\":\"string\",\"final_count\":0},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/JxX0I", + "startTime": 1595901139981, + "sequenceId": 4, + "endTime": 1595901143497 + }, + { + "id": "ywQvm", + "status": "SUCCEEDED", + "hyperParameters": [ + 
"{\"parameter_id\":5,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"dict-defaultdict\",\"intermediate2\":\"normal\",\"intermediate3\":\"inf\",\"intermediate_count\":3,\"final1\":\"normal\",\"final2\":\"dict-nodefault\",\"final_count\":1},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/ywQvm", + "startTime": 1595901140071, + "sequenceId": 5, + "endTime": 1595901144603, + "finalMetricData": [ + { + "timestamp": 1595901144556, + "trialJobId": "ywQvm", + "parameterId": "5", + "type": "FINAL", + "sequence": 0, + "data": "\"-4.804921436452929\"" + } + ] + }, + { + "id": "tkxcP", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":6,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"nan\",\"intermediate2\":\"dict-empty\",\"intermediate3\":\"inf\",\"intermediate_count\":1,\"final1\":\"nan\",\"final2\":\"nan\",\"final_count\":0},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/tkxcP", + "startTime": 1595901140239, + "sequenceId": 6, + "endTime": 1595901141679 + }, + { + "id": "MjX3O", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":7,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"dict-defaultdict\",\"intermediate2\":\"dict-nodefault\",\"intermediate3\":\"inf\",\"intermediate_count\":0,\"final1\":\"dict-empty\",\"final2\":\"dict-defaultdict\",\"final_count\":1},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/MjX3O", + "startTime": 1595901134932, + "sequenceId": 7, + "endTime": 1595901141756, + "finalMetricData": [ + { + "timestamp": 1595901141708, + "trialJobId": "MjX3O", + "parameterId": "7", + "type": "FINAL", + "sequence": 0, + "data": "\"{}\"" + } + ] + }, + { + "id": "LKVCX", + "status": "SUCCEEDED", + "hyperParameters": [ + 
"{\"parameter_id\":8,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"inf\",\"intermediate2\":\"string\",\"intermediate3\":\"dict-defaultdict\",\"intermediate_count\":2,\"final1\":\"inf\",\"final2\":\"dict-defaultdict\",\"final_count\":1},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/LKVCX", + "startTime": 1595901134943, + "sequenceId": 8, + "endTime": 1595901143811, + "finalMetricData": [ + { + "timestamp": 1595901143764, + "trialJobId": "LKVCX", + "parameterId": "8", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + } + ] + }, + { + "id": "MQlPp", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":9,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"dict-defaultdict\",\"intermediate2\":\"nan\",\"intermediate3\":\"dict-defaultdict\",\"intermediate_count\":2,\"final1\":\"dict-nodefault\",\"final2\":\"inf\",\"final_count\":2},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/MQlPp", + "startTime": 1595901134954, + "sequenceId": 9, + "endTime": 1595901144801, + "finalMetricData": [ + { + "timestamp": 1595901143761, + "trialJobId": "MQlPp", + "parameterId": "9", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 8.927687040316364}\"" + } + ] + }, + { + "id": "fJHIW", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":10,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"dict-normal\",\"intermediate2\":\"dict-empty\",\"intermediate3\":\"nan\",\"intermediate_count\":2,\"final1\":\"neginf\",\"final2\":\"nan\",\"final_count\":0},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/fJHIW", + "startTime": 1595901155426, + "sequenceId": 10, + "endTime": 1595901157888 + }, + { + "id": "RDBG2", + "status": "SUCCEEDED", + "hyperParameters": [ + 
"{\"parameter_id\":11,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"normal\",\"intermediate2\":\"string\",\"intermediate3\":\"dict-nodefault\",\"intermediate_count\":0,\"final1\":\"inf\",\"final2\":\"neginf\",\"final_count\":0},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/RDBG2", + "startTime": 1595901155437, + "sequenceId": 11, + "endTime": 1595901155983 + }, + { + "id": "Ofyt2", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":12,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"normal\",\"intermediate2\":\"inf\",\"intermediate3\":\"dict-defaultdict\",\"intermediate_count\":3,\"final1\":\"dict-defaultdict\",\"final2\":\"string\",\"final_count\":2},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/Ofyt2", + "startTime": 1595901155450, + "sequenceId": 12, + "endTime": 1595901161007, + "finalMetricData": [ + { + "timestamp": 1595901159966, + "trialJobId": "Ofyt2", + "parameterId": "12", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"default\\\": {\\\"tensor\\\": 0, \\\"data\\\": 7.483634247448858}}\"" + } + ] + }, + { + "id": "z7WgL", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":13,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"nan\",\"intermediate2\":\"nan\",\"intermediate3\":\"dict-normal\",\"intermediate_count\":2,\"final1\":\"dict-nodefault\",\"final2\":\"dict-nodefault\",\"final_count\":2},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/z7WgL", + "startTime": 1595901155464, + "sequenceId": 13, + "endTime": 1595901159968, + "finalMetricData": [ + { + "timestamp": 1595901158930, + "trialJobId": "z7WgL", + "parameterId": "13", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 1.8045794393579122}\"" + } + ] + }, + { + "id": "OotJc", + "status": "SUCCEEDED", + "hyperParameters": [ + 
"{\"parameter_id\":14,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"dict-empty\",\"intermediate2\":\"inf\",\"intermediate3\":\"dict-empty\",\"intermediate_count\":0,\"final1\":\"dict-nodefault\",\"final2\":\"dict-empty\",\"final_count\":0},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/OotJc", + "startTime": 1595901155490, + "sequenceId": 14, + "endTime": 1595901156062 + }, + { + "id": "WxWLk", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":15,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"nan\",\"intermediate2\":\"normal\",\"intermediate3\":\"string\",\"intermediate_count\":0,\"final1\":\"nan\",\"final2\":\"dict-empty\",\"final_count\":0},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/WxWLk", + "startTime": 1595901155536, + "sequenceId": 15, + "endTime": 1595901156147 + }, + { + "id": "Zzazj", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":16,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"dict-empty\",\"intermediate2\":\"dict-normal\",\"intermediate3\":\"normal\",\"intermediate_count\":0,\"final1\":\"neginf\",\"final2\":\"nan\",\"final_count\":0},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/Zzazj", + "startTime": 1595901155644, + "sequenceId": 16, + "endTime": 1595901156212 + }, + { + "id": "aKV3K", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":17,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"dict-defaultdict\",\"intermediate2\":\"inf\",\"intermediate3\":\"normal\",\"intermediate_count\":0,\"final1\":\"nan\",\"final2\":\"inf\",\"final_count\":1},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/aKV3K", + "startTime": 1595901155795, + "sequenceId": 17, + "endTime": 1595901157314, + "finalMetricData": [ + { + 
"timestamp": 1595901157264, + "trialJobId": "aKV3K", + "parameterId": "17", + "type": "FINAL", + "sequence": 0, + "data": "\"NaN\"" + } + ] + }, + { + "id": "WR5fG", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":18,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"inf\",\"intermediate2\":\"dict-empty\",\"intermediate3\":\"inf\",\"intermediate_count\":0,\"final1\":\"normal\",\"final2\":\"nan\",\"final_count\":0},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/WR5fG", + "startTime": 1595901155825, + "sequenceId": 18, + "endTime": 1595901156321 + }, + { + "id": "EFsFo", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":19,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"inf\",\"intermediate2\":\"dict-nodefault\",\"intermediate3\":\"nan\",\"intermediate_count\":0,\"final1\":\"normal\",\"final2\":\"dict-defaultdict\",\"final_count\":1},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/EFsFo", + "startTime": 1595901150521, + "sequenceId": 19, + "endTime": 1595901157386, + "finalMetricData": [ + { + "timestamp": 1595901157336, + "trialJobId": "EFsFo", + "parameterId": "19", + "type": "FINAL", + "sequence": 0, + "data": "\"-0.9452602480917385\"" + } + ] + }, + { + "id": "dUJTL", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":20,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"inf\",\"intermediate2\":\"string\",\"intermediate3\":\"inf\",\"intermediate_count\":1,\"final1\":\"inf\",\"final2\":\"string\",\"final_count\":1},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/dUJTL", + "startTime": 1595901170933, + "sequenceId": 20, + "endTime": 1595901173438, + "finalMetricData": [ + { + "timestamp": 1595901173388, + "trialJobId": "dUJTL", + "parameterId": "20", + "type": "FINAL", + "sequence": 0, + "data": 
"\"Infinity\"" + } + ] + }, + { + "id": "xAoeQ", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":21,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"dict-normal\",\"intermediate2\":\"neginf\",\"intermediate3\":\"normal\",\"intermediate_count\":3,\"final1\":\"normal\",\"final2\":\"inf\",\"final_count\":2},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/xAoeQ", + "startTime": 1595901170945, + "sequenceId": 21, + "endTime": 1595901176469, + "finalMetricData": [ + { + "timestamp": 1595901175418, + "trialJobId": "xAoeQ", + "parameterId": "21", + "type": "FINAL", + "sequence": 0, + "data": "\"-8.43771544998285\"" + } + ] + }, + { + "id": "de6XT", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":22,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"dict-normal\",\"intermediate2\":\"nan\",\"intermediate3\":\"string\",\"intermediate_count\":2,\"final1\":\"dict-nodefault\",\"final2\":\"neginf\",\"final_count\":2},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/de6XT", + "startTime": 1595901170962, + "sequenceId": 22, + "endTime": 1595901175475, + "finalMetricData": [ + { + "timestamp": 1595901174434, + "trialJobId": "de6XT", + "parameterId": "22", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 0.8472658215331563}\"" + } + ] + }, + { + "id": "A7C0a", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":23,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"nan\",\"intermediate2\":\"normal\",\"intermediate3\":\"neginf\",\"intermediate_count\":1,\"final1\":\"dict-empty\",\"final2\":\"dict-defaultdict\",\"final_count\":1},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/A7C0a", + "startTime": 1595901170977, + "sequenceId": 23, + "endTime": 1595901173571, + "finalMetricData": [ + { + "timestamp": 
1595901173524, + "trialJobId": "A7C0a", + "parameterId": "23", + "type": "FINAL", + "sequence": 0, + "data": "\"{}\"" + } + ] + }, + { + "id": "Rofrb", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":24,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"inf\",\"intermediate2\":\"dict-nodefault\",\"intermediate3\":\"neginf\",\"intermediate_count\":0,\"final1\":\"dict-nodefault\",\"final2\":\"dict-normal\",\"final_count\":2},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/Rofrb", + "startTime": 1595901170990, + "sequenceId": 24, + "endTime": 1595901173521, + "finalMetricData": [ + { + "timestamp": 1595901172467, + "trialJobId": "Rofrb", + "parameterId": "24", + "type": "FINAL", + "sequence": 0, + "data": "\"{\\\"other\\\": 8.461683188282755}\"" + } + ] + }, + { + "id": "MOOrR", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":25,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"dict-defaultdict\",\"intermediate2\":\"string\",\"intermediate3\":\"dict-empty\",\"intermediate_count\":3,\"final1\":\"string\",\"final2\":\"neginf\",\"final_count\":2},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/MOOrR", + "startTime": 1595901171032, + "sequenceId": 25, + "endTime": 1595901176529, + "finalMetricData": [ + { + "timestamp": 1595901175485, + "trialJobId": "MOOrR", + "parameterId": "25", + "type": "FINAL", + "sequence": 0, + "data": "\"\\\"2.8954800063491586\\\"\"" + } + ] + }, + { + "id": "p2m5y", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":26,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"normal\",\"intermediate2\":\"dict-nodefault\",\"intermediate3\":\"inf\",\"intermediate_count\":2,\"final1\":\"normal\",\"final2\":\"dict-defaultdict\",\"final_count\":1},\"parameter_index\":0}" + ], + "logPath": 
"file://localhost:/***/nni/experiments/Tkaxm2mb/trials/p2m5y", + "startTime": 1595901171129, + "sequenceId": 26, + "endTime": 1595901174686, + "finalMetricData": [ + { + "timestamp": 1595901174636, + "trialJobId": "p2m5y", + "parameterId": "26", + "type": "FINAL", + "sequence": 0, + "data": "\"9.902729745066438\"" + } + ] + }, + { + "id": "mSPRF", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":27,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"neginf\",\"intermediate2\":\"nan\",\"intermediate3\":\"dict-nodefault\",\"intermediate_count\":2,\"final1\":\"string\",\"final2\":\"dict-empty\",\"final_count\":2},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/mSPRF", + "startTime": 1595901171272, + "sequenceId": 27, + "endTime": 1595901175714, + "finalMetricData": [ + { + "timestamp": 1595901174667, + "trialJobId": "mSPRF", + "parameterId": "27", + "type": "FINAL", + "sequence": 0, + "data": "\"\\\"-2.5183912965656763\\\"\"" + } + ] + }, + { + "id": "G5nv9", + "status": "SUCCEEDED", + "hyperParameters": [ + "{\"parameter_id\":28,\"parameter_source\":\"algorithm\",\"parameters\":{\"intermediate1\":\"neginf\",\"intermediate2\":\"string\",\"intermediate3\":\"dict-normal\",\"intermediate_count\":3,\"final1\":\"inf\",\"final2\":\"normal\",\"final_count\":2},\"parameter_index\":0}" + ], + "logPath": "file://localhost:/***/nni/experiments/Tkaxm2mb/trials/G5nv9", + "startTime": 1595901171428, + "sequenceId": 28, + "endTime": 1595901176884, + "finalMetricData": [ + { + "timestamp": 1595901175834, + "trialJobId": "G5nv9", + "parameterId": "28", + "type": "FINAL", + "sequence": 0, + "data": "\"Infinity\"" + } + ] + } + ] +} diff --git a/src/webui/src/App.tsx b/src/webui/src/App.tsx index 2405b02de0..690efc23c4 100644 --- a/src/webui/src/App.tsx +++ b/src/webui/src/App.tsx @@ -15,13 +15,13 @@ interface AppState { isillegalFinal: boolean; expWarningMessage: string; bestTrialEntries: string; // 
for overview page: best trial entreis + isUpdate: boolean; } class App extends React.Component<{}, AppState> { private timerId!: number | undefined; private dataFormatimer!: number; private firstLoad: boolean = false; // when click refresh selector options - constructor(props: {}) { super(props); this.state = { @@ -32,16 +32,19 @@ class App extends React.Component<{}, AppState> { metricGraphMode: 'max', isillegalFinal: false, expWarningMessage: '', - bestTrialEntries: '10' + bestTrialEntries: '10', + isUpdate: true }; } async componentDidMount(): Promise { await Promise.all([EXPERIMENT.init(), TRIALS.init()]); - this.setState(state => ({ experimentUpdateBroadcast: state.experimentUpdateBroadcast + 1 })); - this.setState(state => ({ trialsUpdateBroadcast: state.trialsUpdateBroadcast + 1 })); - this.timerId = window.setTimeout(this.refresh, this.state.interval * 1000); - this.setState({ metricGraphMode: (EXPERIMENT.optimizeMode === 'minimize' ? 'min' : 'max') }); + this.setState(state => ({ + experimentUpdateBroadcast: state.experimentUpdateBroadcast + 1, + trialsUpdateBroadcast: state.trialsUpdateBroadcast + 1, + metricGraphMode: (EXPERIMENT.optimizeMode === 'minimize' ? 
'min' : 'max') + })); + this.timerId = window.setTimeout(this.refresh, this.state.interval * 100); // final result is legal // get a succeed trial,see final result data's format // eslint-disable-next-line @typescript-eslint/no-non-null-assertion @@ -99,6 +102,15 @@ class App extends React.Component<{}, AppState> { this.setState({ bestTrialEntries: entries }); } + shouldComponentUpdate(nextProps: any, nextState: AppState): boolean { + + if(!(nextState.isUpdate || nextState.isUpdate === undefined)){ + nextState.isUpdate = true; + return false; + } + return true; + } + render(): React.ReactNode { const { interval, columnList, experimentUpdateBroadcast, trialsUpdateBroadcast, metricGraphMode, isillegalFinal, expWarningMessage, bestTrialEntries @@ -106,7 +118,6 @@ class App extends React.Component<{}, AppState> { if (experimentUpdateBroadcast === 0 || trialsUpdateBroadcast === 0) { return null; // TODO: render a loading page } - const errorList = [ { errorWhere: TRIALS.jobListError(), errorMessage: TRIALS.getJobErrorMessage() }, { errorWhere: EXPERIMENT.experimentError(), errorMessage: EXPERIMENT.getExperimentMessage() }, @@ -158,7 +169,6 @@ class App extends React.Component<{}, AppState> { } private refresh = async (): Promise => { - // resolve this question: 10s -> 20s, page refresh twice. 
// only refresh this page after clicking the refresh options if (this.firstLoad !== true) { @@ -177,8 +187,7 @@ class App extends React.Component<{}, AppState> { // experiment status and /trial-jobs api's status could decide website update if (['DONE', 'ERROR', 'STOPPED'].includes(EXPERIMENT.status) || TRIALS.jobListError()) { // experiment finished, refresh once more to ensure consistency - this.setState({ interval: 0 }); - this.lastRefresh(); + this.setState(() => ({ interval: 0, isUpdate: false })); return; } @@ -189,8 +198,7 @@ class App extends React.Component<{}, AppState> { public async lastRefresh(): Promise { await EXPERIMENT.update(); await TRIALS.update(true); - this.setState(state => ({ experimentUpdateBroadcast: state.experimentUpdateBroadcast + 1 })); - this.setState(state => ({ trialsUpdateBroadcast: state.trialsUpdateBroadcast + 1 })); + this.setState(state => ({ experimentUpdateBroadcast: state.experimentUpdateBroadcast + 1, trialsUpdateBroadcast: state.trialsUpdateBroadcast + 1 })); } } diff --git a/src/webui/src/components/Modals/Compare.tsx b/src/webui/src/components/Modals/Compare.tsx index ac73f860bc..44f7afd0fb 100644 --- a/src/webui/src/components/Modals/Compare.tsx +++ b/src/webui/src/components/Modals/Compare.tsx @@ -1,11 +1,17 @@ import * as React from 'react'; -import { Stack, Modal, IconButton } from 'office-ui-fabric-react'; +import { Stack, Modal, IconButton, IDragOptions, ContextualMenu } from 'office-ui-fabric-react'; import ReactEcharts from 'echarts-for-react'; import IntermediateVal from '../public-child/IntermediateVal'; import { TRIALS } from '../../static/datamodel'; +import { TableRecord, Intermedia, TooltipForIntermediate } from '../../static/interface'; import { contentStyles, iconButtonStyles } from '../Buttons/ModalTheme'; import '../../static/style/compare.scss'; -import { TableRecord, Intermedia, TooltipForIntermediate } from '../../static/interface'; + +const dragOptions: IDragOptions = { + moveMenuItemText: 'Move', + 
closeMenuItemText: 'Close', + menu: ContextualMenu +}; // the modal of trial compare interface CompareProps { @@ -79,7 +85,8 @@ class Compare extends React.Component { containLabel: true }, legend: { - data: idsList + // more than 10 trials will hide legend + data: idsList.length > 10 ? null : idsList }, xAxis: { type: 'category', @@ -209,6 +216,8 @@ class Compare extends React.Component { isOpen={true} containerClassName={contentStyles.container} className="compare-modal" + allowTouchBodyScroll={true} + dragOptions={dragOptions} >
diff --git a/src/webui/src/components/Modals/ExperimentDrawer.tsx b/src/webui/src/components/Modals/ExperimentPanel.tsx similarity index 56% rename from src/webui/src/components/Modals/ExperimentDrawer.tsx rename to src/webui/src/components/Modals/ExperimentPanel.tsx index 142af89f59..cbc674ec9a 100644 --- a/src/webui/src/components/Modals/ExperimentDrawer.tsx +++ b/src/webui/src/components/Modals/ExperimentPanel.tsx @@ -1,17 +1,16 @@ import * as React from 'react'; -import axios from 'axios'; import { downFile } from '../../static/function'; import { Stack, PrimaryButton, DefaultButton, Panel, StackItem, Pivot, PivotItem } from 'office-ui-fabric-react'; -import { MANAGER_IP, DRAWEROPTION } from '../../static/const'; +import { DRAWEROPTION } from '../../static/const'; +import { EXPERIMENT, TRIALS } from '../../static/datamodel'; import MonacoEditor from 'react-monaco-editor'; import '../../static/style/logDrawer.scss'; -import { TrialManager } from '../../static/model/trialmanager'; interface ExpDrawerProps { - isVisble: boolean; closeExpDrawer: () => void; + experimentProfile: object; } interface ExpDrawerState { @@ -21,7 +20,9 @@ interface ExpDrawerState { class ExperimentDrawer extends React.Component { - public _isCompareMount!: boolean; + public _isExperimentMount!: boolean; + private refreshId!: number | undefined; + constructor(props: ExpDrawerProps) { super(props); @@ -32,42 +33,40 @@ class ExperimentDrawer extends React.Component { } getExperimentContent = (): void => { - axios - .all([ - axios.get(`${MANAGER_IP}/experiment`), - axios.get(`${MANAGER_IP}/trial-jobs`), - axios.get(`${MANAGER_IP}/metric-data`) - ]) - .then(axios.spread((resExperiment, resTrialJobs, resMetricData) => { - if (resExperiment.status === 200 && resTrialJobs.status === 200 && resMetricData.status === 200) { - if (resExperiment.data.params.searchSpace) { - resExperiment.data.params.searchSpace = JSON.parse(resExperiment.data.params.searchSpace); - } - const trialMessagesArr = 
TrialManager.expandJobsToTrials(resTrialJobs.data); - const interResultList = resMetricData.data; - Object.keys(trialMessagesArr).map(item => { - // not deal with trial's hyperParameters - const trialId = trialMessagesArr[item].id; - // add intermediate result message - trialMessagesArr[item].intermediate = []; - Object.keys(interResultList).map(key => { - const interId = `${interResultList[key].trialJobId}-${interResultList[key].parameterId}`; - if (trialId === interId) { - trialMessagesArr[item].intermediate.push(interResultList[key]); - } - }); - }); - const result = { - experimentParameters: resExperiment.data, - trialMessage: trialMessagesArr - }; - if (this._isCompareMount === true) { - this.setState({ experiment: JSON.stringify(result, null, 4) }); - } + const experimentData = JSON.parse(JSON.stringify(this.props.experimentProfile)); + if (experimentData.params.searchSpace) { + experimentData.params.searchSpace = JSON.parse(experimentData.params.searchSpace); + } + const trialMessagesArr = TRIALS.getTrialJobList(); + const interResultList = TRIALS.getMetricsList(); + Object.keys(trialMessagesArr).map(item => { + // not deal with trial's hyperParameters + const trialId = trialMessagesArr[item].jobId; + // add intermediate result message + trialMessagesArr[item].intermediate = []; + Object.keys(interResultList).map(key => { + const interId = interResultList[key].trialJobId; + if (trialId === interId) { + trialMessagesArr[item].intermediate.push(interResultList[key]); } - })); - } + }); + }); + const result = { + experimentParameters: experimentData, + trialMessage: trialMessagesArr + }; + if (this._isExperimentMount === true) { + this.setState({ experiment: JSON.stringify(result, null, 4) }); + } + if (['DONE', 'ERROR', 'STOPPED'].includes(EXPERIMENT.status)) { + if(this.refreshId !== null || this.refreshId !== undefined){ + window.clearInterval(this.refreshId); + } + } + + } + downExperimentParameters = (): void => { const { experiment } = this.state; 
downFile(experiment, 'experiment.json'); @@ -78,31 +77,28 @@ class ExperimentDrawer extends React.Component { } componentDidMount(): void { - this._isCompareMount = true; + this._isExperimentMount = true; this.getExperimentContent(); + this.refreshId = window.setInterval(this.getExperimentContent, 10000); window.addEventListener('resize', this.onWindowResize); } - componentWillReceiveProps(nextProps: ExpDrawerProps): void { - const { isVisble } = nextProps; - if (isVisble === true) { - this.getExperimentContent(); - } - } - componentWillUnmount(): void { - this._isCompareMount = false; + this._isExperimentMount = false; + window.clearTimeout(this.refreshId); window.removeEventListener('resize', this.onWindowResize); } render(): React.ReactNode { - const { isVisble, closeExpDrawer } = this.props; + const { closeExpDrawer } = this.props; const { experiment, expDrawerHeight } = this.state; return ( diff --git a/src/webui/src/components/Modals/Killjob.tsx b/src/webui/src/components/Modals/Killjob.tsx index 580ff5ff24..2f4c7a1833 100644 --- a/src/webui/src/components/Modals/Killjob.tsx +++ b/src/webui/src/components/Modals/Killjob.tsx @@ -77,7 +77,7 @@ class KillJob extends React.Component { onKill = (): void => { this.setState({ isCalloutVisible: false }, () => { const { trial } = this.props; - killJob(trial.key, trial.jobId, trial.status); + killJob(trial.key, trial.id, trial.status); }); } diff --git a/src/webui/src/components/Modals/LogDrawer.tsx b/src/webui/src/components/Modals/LogPanel.tsx similarity index 98% rename from src/webui/src/components/Modals/LogDrawer.tsx rename to src/webui/src/components/Modals/LogPanel.tsx index a54b0f4c25..97a408fe9d 100644 --- a/src/webui/src/components/Modals/LogDrawer.tsx +++ b/src/webui/src/components/Modals/LogPanel.tsx @@ -92,6 +92,8 @@ class LogDrawer extends React.Component { isOpen={true} hasCloseButton={false} isFooterAtBottom={true} + isLightDismiss={true} + onLightDismissClick={closeDrawer} >
{ openDocs = (): void => { window.open(WEBUIDOC); } - + openGithubNNI = (): void => { - const {version} = this.state; + const { version } = this.state; const nniLink = `https://github.com/Microsoft/nni/tree/${version}`; window.open(nniLink); } @@ -178,8 +179,8 @@ class NavCon extends React.Component { {/* the drawer for dispatcher & nnimanager log message */} - {isvisibleLogDrawer && } - + {isvisibleLogDrawer && } + {isvisibleExperimentDrawer && } ); } diff --git a/src/webui/src/components/Overview.tsx b/src/webui/src/components/Overview.tsx index 54e5f5ba5c..b2e00efae7 100644 --- a/src/webui/src/components/Overview.tsx +++ b/src/webui/src/components/Overview.tsx @@ -12,6 +12,18 @@ import TrialInfo from './overview/TrialProfile'; import '../static/style/overview.scss'; import '../static/style/logPath.scss'; +const stackTokens: IStackTokens = { + childrenGap: 30, +}; + +const entriesOption = [ + { key: '10', text: 'Display top 10 trials' }, + { key: '20', text: 'Display top 20 trials' }, + { key: '30', text: 'Display top 30 trials' }, + { key: '50', text: 'Display top 50 trials' }, + { key: '100', text: 'Display top 100 trials' } +]; + interface OverviewProps { experimentUpdateBroadcast: number; trialsUpdateBroadcast: number; @@ -70,17 +82,6 @@ class Overview extends React.Component { const titleMaxbgcolor = (metricGraphMode === 'max' ? '#333' : '#b3b3b3'); const titleMinbgcolor = (metricGraphMode === 'min' ? '#333' : '#b3b3b3'); - const stackTokens: IStackTokens = { - childrenGap: 30, - }; - - const entriesOption = [ - { key: '10', text: 'Display top 10 trials' }, - { key: '20', text: 'Display top 20 trials' }, - { key: '30', text: 'Display top 30 trials' }, - { key: '50', text: 'Display top 50 trials' }, - { key: '100', text: 'Display top 100 trials' } - ]; return (
{/* status and experiment block */} @@ -123,7 +124,7 @@ class Overview extends React.Component { - +
{ >
-
+
{ } } + openTrialLog = (type: string): void => { + window.open(`${MANAGER_IP}/trial-log/${this.props.trialId}/${type}`); + } + render(): React.ReactNode { const { isHidenInfo, typeInfo, info } = this.state; const trialId = this.props.trialId; @@ -105,7 +111,23 @@ class OpenRow extends React.Component { logCollection={EXPERIMENT.logCollectionEnabled} /> : - +
+ + {/* view each trial log in drawer*/} +
+
+ + +
+
+
} diff --git a/src/webui/src/components/trial-detail/Para.tsx b/src/webui/src/components/trial-detail/Para.tsx index f2f130949a..f6fd0a7884 100644 --- a/src/webui/src/components/trial-detail/Para.tsx +++ b/src/webui/src/components/trial-detail/Para.tsx @@ -162,21 +162,32 @@ class Para extends React.Component { const scale = this.convertToD3Scale(v); if (k === primaryMetricKey && scale !== undefined && scale.interpolate) { // set color for primary metrics - colorScale = this.convertToD3Scale(v, false) - .range(['green', 'red']) - .interpolate(d3.interpolateHsl); - colorDim = k; + // `colorScale` is used to produce a color range, while `scale` is to produce a pixel range + colorScale = this.convertToD3Scale(v, false); + convertedTrials.sort((a, b) => EXPERIMENT.optimizeMode === 'minimize' ? a[k] - b[k] : b[k] - a[k]); // filter top trials if (percent != 1) { const keptTrialNum = Math.max(Math.ceil(convertedTrials.length * percent), 1); - convertedTrials.sort((a, b) => EXPERIMENT.optimizeMode === 'minimize' ? 
a[k] - b[k] : b[k] - a[k]); convertedTrials = convertedTrials.slice(0, keptTrialNum); const domain = d3.extent(convertedTrials, item => item[k]); scale.domain([domain[0], domain[1]]); + colorScale.domain([domain[0], domain[1]]); if (colorScale !== undefined) { colorScale.domain(domain); } } + // reverse the converted trials to show the top ones upfront + convertedTrials.reverse(); + const assignColors = (scale: any): void => { + scale.range([0, 1]); // fake a range to perform invert + const [scaleMin, scaleMax] = scale.domain(); + const pivot = scale.invert(0.5); + scale.domain([scaleMin, pivot, scaleMax]) + .range(['#90EE90', '#FFC400', '#CA0000']) + .interpolate(d3.interpolateHsl); + }; + assignColors(colorScale); + colorDim = k; } dimensions.push([k, { type: 'number', @@ -184,7 +195,7 @@ class Para extends React.Component { }]); } - if (convertedTrials.length === 0) { + if (convertedTrials.length === 0 || dimensions.length <= 1) { return; } diff --git a/src/webui/src/components/trial-detail/TableList.tsx b/src/webui/src/components/trial-detail/TableList.tsx index 94af7e8fb9..ee11f0fdec 100644 --- a/src/webui/src/components/trial-detail/TableList.tsx +++ b/src/webui/src/components/trial-detail/TableList.tsx @@ -75,7 +75,7 @@ interface TableListState { tableSourceForSort: Array; sortMessage: SortInfo; offset: number; - data: Array; + tablePerPage: Array; perPage: number; currentPage: number; pageCount: number; @@ -111,7 +111,7 @@ class TableList extends React.Component { allColumnList: this.getAllColumnKeys(), sortMessage: { field: '', isDescend: false }, offset: 0, - data: [], + tablePerPage: [], perPage: 20, currentPage: 0, pageCount: 0, @@ -121,7 +121,7 @@ class TableList extends React.Component { // sort for table column onColumnClick = (ev: React.MouseEvent, getColumn: IColumn): void => { - const { tableColumns, tableSourceForSort } = this.state; + const { tableColumns } = this.state; const newColumns: IColumn[] = tableColumns.slice(); const currColumn: 
IColumn = newColumns.filter(item => getColumn.key === item.key)[0]; newColumns.forEach((newCol: IColumn) => { @@ -133,26 +133,12 @@ class TableList extends React.Component { newCol.isSortedDescending = true; } }); - // eslint-disable-next-line @typescript-eslint/no-non-null-assertion - const newItems = this.copyAndSort(tableSourceForSort, currColumn.fieldName!, currColumn.isSortedDescending); + this.setState({ tableColumns: newColumns, - tableSourceForSort: newItems, sortMessage: { field: getColumn.key, isDescend: currColumn.isSortedDescending } - }); - - }; - - private copyAndSort(items: T[], columnKey: string, isSortedDescending?: boolean): any { - const key = columnKey as keyof T; - return items.slice(0).sort(function (a: T, b: T): any { - if (a[key] === undefined) { - return 1; - } - if (b[key] === undefined) { - return -1; - } - return (isSortedDescending ? a[key] < b[key] : a[key] > b[key]) ? 1 : -1; + }, () => { + this.updateData(); }); } @@ -269,7 +255,7 @@ class TableList extends React.Component { showIntermediateModal = async (record: TrialJobInfo, event: React.SyntheticEvent): Promise => { event.preventDefault(); event.stopPropagation(); - const res = await axios.get(`${MANAGER_IP}/metric-data/${record.jobId}`); + const res = await axios.get(`${MANAGER_IP}/metric-data/${record.id}`); if (res.status === 200) { const intermediateArr: number[] = []; // support intermediate result is dict because the last intermediate result is @@ -277,14 +263,10 @@ class TableList extends React.Component { // get intermediate result dict keys array const { intermediateKey } = this.state; const otherkeys: string[] = []; - // One trial job may contains multiple parameter id - // only show current trial's metric data - const metricDatas = res.data.filter(item => { - return item.parameterId == record.parameterId; - }); + const metricDatas = res.data; if (metricDatas.length !== 0) { // just add type=number keys - const intermediateMetrics = parseMetrics(res.data[0].data); + 
const intermediateMetrics = parseMetrics(metricDatas[0].data); for (const key in intermediateMetrics) { if (typeof intermediateMetrics[key] === 'number') { otherkeys.push(key); @@ -571,61 +553,86 @@ class TableList extends React.Component { componentDidMount(): void { window.addEventListener('resize', this.onWindowResize); - this.updateData() + this.updateData(); } componentDidUpdate(prevProps: TableListProps): void { - if (this.props.columnList !== prevProps.columnList || this.props.tableSource !== prevProps.tableSource) { + if (this.props.columnList !== prevProps.columnList || this.props.tableSource !== prevProps.tableSource || prevProps.trialsUpdateBroadcast !== this.props.trialsUpdateBroadcast) { const { columnList } = this.props; this.setState({ tableColumns: this.initTableColumnList(columnList), allColumnList: this.getAllColumnKeys() - }, () => {this.updateData(); - }); + }, () => { + this.updateData(); + }); } } + // slice all table data into current page data updateData(): void { - const tableSource: Array = JSON.parse(JSON.stringify(this.props.tableSource)); + const tableSource: Array = this.props.tableSource; + const { offset, perPage, sortMessage } = this.state; + + if (sortMessage.field !== '') { + tableSource.sort(function (a, b): any { + if (a[sortMessage.field] === undefined || Object.is(a[sortMessage.field], NaN) || Object.is(a[sortMessage.field], Infinity) || Object.is(a[sortMessage.field], -Infinity) || typeof a[sortMessage.field] === 'object' ) { + return 1; + } + if (b[sortMessage.field] === undefined || Object.is(b[sortMessage.field], NaN) || Object.is(b[sortMessage.field], Infinity) || Object.is(b[sortMessage.field], -Infinity) || typeof b[sortMessage.field] === 'object' ) { + return -1; + } + return (sortMessage.isDescend ? a[sortMessage.field] < b[sortMessage.field] : a[sortMessage.field] > b[sortMessage.field]) ? 
1 : -1; + }); + } - const tableSlice = tableSource.slice(this.state.offset, this.state.offset + this.state.perPage) - + const tableSlice = tableSource.slice(offset, offset + perPage) + const curPageCount = Math.ceil(tableSource.length / perPage) this.setState({ - tableSourceForSort: tableSlice, - pageCount: Math.ceil(tableSource.length / this.state.perPage), + tablePerPage: tableSlice, + pageCount: curPageCount, }); } + // update data when click the page index of pagination handlePageClick = (evt: any): void => { const selectedPage = evt.selected; const offset = selectedPage * this.state.perPage; this.setState({ currentPage: selectedPage, - offset: offset }, - () => { this.updateData(); + offset: offset + }, () => { + this.updateData(); }); } - updateperPage = (event: React.FormEvent, item: IDropdownOption | undefined): void => { - // clear input value and re-render table + // update per page items when click the dropdown of pagination + updatePerPage = (event: React.FormEvent, item: IDropdownOption | undefined): void => { + const { pageCount } = this.state; + if (item !== undefined) { + const currentPerPage = item.key === 'all' ? this.props.tableSource.length: Number(item.key) + const currentPageCount = this.props.tableSource.length <= currentPerPage ? 1 : pageCount + this.setState({ - perPage: item.key === 'all' ? 
this.props.tableSource.length: Number(item.key) }, - () => {this.updateData(); + perPage: currentPerPage, + offset: 0, + currentPage: 0, + pageCount: currentPageCount + }, () => { + this.updateData(); }); } } - render(): React.ReactNode { const { intermediateKey, modalIntermediateWidth, modalIntermediateHeight, tableColumns, allColumnList, isShowColumn, modalVisible, selectRows, isShowCompareModal, intermediateOtherKeys, - isShowCustomizedModal, copyTrialId, intermediateOption, sortMessage + isShowCustomizedModal, copyTrialId, intermediateOption, + tablePerPage } = this.state; const { columnList } = this.props; - const tableSource = this.state.tableSourceForSort const perPageOptions = [ { key: '10', text: '10 items per page'}, { key: '20', text: '20 items per page'}, @@ -633,25 +640,12 @@ class TableList extends React.Component { { key: 'all', text: 'All items'}, ]; - - if (sortMessage.field !== '') { - tableSource.sort(function (a, b): any { - if (a[sortMessage.field] === undefined) { - return 1; - } - if (b[sortMessage.field] === undefined) { - return -1; - } - return (sortMessage.isDescend ? a[sortMessage.field] < b[sortMessage.field] : a[sortMessage.field] > b[sortMessage.field]) ? 1 : -1; - }); - } - return (
{ - {/* this.props.tableSource.length > this.state.perPage && */} "} @@ -680,11 +673,11 @@ class TableList extends React.Component { containerClassName={(this.props.tableSource.length == 0 ? "pagination hidden" : "pagination" )} subContainerClassName={"pages pagination"} disableInitialCallback={false} - activeClassName={"active"}/> - + activeClassName={"active"} + forcePage={this.state.currentPage} + /> - {/* /> */}
{/* Intermediate Result Modal */} { style={{ width: 0.5 * modalIntermediateWidth, height: 0.7 * modalIntermediateHeight, + maxHeight: 534, padding: 20 }} theme="my_theme" diff --git a/src/webui/src/static/function.ts b/src/webui/src/static/function.ts index 485585e224..fda4cd89cf 100644 --- a/src/webui/src/static/function.ts +++ b/src/webui/src/static/function.ts @@ -51,7 +51,7 @@ const convertDuration = (num: number): string => { }; function parseMetrics(metricData: string): any { - if (metricData.includes('NaN')) { + if (metricData.includes('NaN') || metricData.includes('Infinity')) { return JSON5.parse(JSON5.parse(metricData)); } else { return JSON.parse(JSON.parse(metricData)); @@ -84,15 +84,18 @@ const getFinalResult = (final?: MetricDataRecord[]): number => { } }; +function isNaNorInfinity(val: number): boolean { + return Object.is(val, NaN) || Object.is(val, Infinity); +} + // get final result value // acc obj const getFinal = (final?: MetricDataRecord[]): FinalType | undefined => { let showDefault: FinalType; if (final) { showDefault = parseMetrics(final[final.length - 1].data); if (typeof showDefault === 'number') { - if(!isNaN(showDefault)){ - showDefault = { default: showDefault }; - return showDefault; + if(!isNaNorInfinity(showDefault)){ + return { default: showDefault }; } } else if (isArrayType(showDefault)) { // not support final type @@ -131,7 +134,8 @@ const intermediateGraphOption = (intermediateArr: number[], id: string): any => yAxis: { name: 'Default metric', type: 'value', - data: intermediateArr + data: intermediateArr, + scale: true }, series: [{ symbolSize: 6, @@ -164,11 +168,9 @@ const killJob = (key: number, id: string, status: string, updateList?: Function) .catch(error => { if (error.response.status === 500) { if (error.response.data.error) { - alert(123); - // message.error(error.response.data.error); + alert(error.response.data.error); } else { - alert(234); - // message.error('500 error, fail to cancel the job'); + alert('500 error, 
fail to cancel the job'); } } }); @@ -228,9 +230,17 @@ function formatAccuracy(accuracy: number): string { return accuracy.toFixed(6).replace(/0+$/, '').replace(/\.$/, ''); } +function formatComplexTypeValue(value: any): string | number { + if (['number', 'string'].includes(typeof value)) { + return value; + } else { + return value.toString(); + } +} + export { convertTime, convertDuration, getFinalResult, getFinal, downFile, intermediateGraphOption, killJob, filterByStatus, filterDuration, formatAccuracy, formatTimestamp, metricAccuracy, parseMetrics, - isArrayType, requestAxios + isArrayType, requestAxios, isNaNorInfinity, formatComplexTypeValue }; diff --git a/src/webui/src/static/interface.ts b/src/webui/src/static/interface.ts index c033c225f4..734921ead3 100644 --- a/src/webui/src/static/interface.ts +++ b/src/webui/src/static/interface.ts @@ -43,12 +43,10 @@ interface TableRecord { startTime: number; endTime?: number; id: string; - jobId: string; - parameterId: string; duration: number; status: string; intermediateCount: number; - accuracy?: number; + accuracy?: number | any; latestAccuracy: number | undefined; formattedLatestAccuracy: string; // format (LATEST/FINAL), accDictionary: FinalType | undefined; @@ -126,8 +124,6 @@ interface Intermedia { interface MetricDataRecord { timestamp: number; trialJobId: string; - trialId: string; - parameterId: string; type: string; sequence: number; data: string; @@ -135,8 +131,6 @@ interface MetricDataRecord { interface TrialJobInfo { id: string; - jobId: string; - parameterId: string; sequenceId: number; status: string; startTime?: number; diff --git a/src/webui/src/static/model/experiment.ts b/src/webui/src/static/model/experiment.ts index 2f899eccdf..3d8d088789 100644 --- a/src/webui/src/static/model/experiment.ts +++ b/src/webui/src/static/model/experiment.ts @@ -58,7 +58,7 @@ class Experiment { await requestAxios(`${MANAGER_IP}/experiment`) .then(data => { - updated = updated || compareProfiles(this.profileField, 
data); + updated = updated || !compareProfiles(this.profileField, data); this.profileField = data; }) .catch(error => { @@ -69,7 +69,7 @@ class Experiment { await requestAxios(`${MANAGER_IP}/check-status`) .then(data => { - updated = JSON.stringify(this.statusField) === JSON.stringify(data); + updated = JSON.stringify(this.statusField) !== JSON.stringify(data); this.statusField = data; }) .catch(error => { diff --git a/src/webui/src/static/model/searchspace.ts b/src/webui/src/static/model/searchspace.ts index 75006bc1cd..cd1cfede23 100644 --- a/src/webui/src/static/model/searchspace.ts +++ b/src/webui/src/static/model/searchspace.ts @@ -1,5 +1,6 @@ import { SingleAxis, MultipleAxes, TableObj } from '../interface'; import { SUPPORTED_SEARCH_SPACE_TYPE } from '../const'; +import { formatComplexTypeValue } from '../function'; function fullNameJoin(prefix: string, name: string): string { return prefix ? (prefix + '/' + name) : name; @@ -52,7 +53,7 @@ class SimpleOrdinalAxis implements SingleAxis { this.baseName = baseName; this.fullName = fullName; this.type = type; - this.domain = value; + this.domain = Array.from(value).map(formatComplexTypeValue); } } @@ -115,7 +116,7 @@ export class SearchSpace implements MultipleAxes { trial.parameters(searchSpace); } catch (unexpectedEntries) { // eslint-disable-next-line no-console - console.log(unexpectedEntries); + console.warn(unexpectedEntries); for (const [k, v] of unexpectedEntries as Map) { const column = addingColumns.get(k); if (column === undefined) { @@ -164,7 +165,7 @@ export class MetricSpace implements MultipleAxes { if (value.every(v => typeof v === 'number')) { this.axes.set(key, new NumericAxis(key, key, 'uniform', [Math.min(...value), Math.max(...value)])); } else { - // TODO: skip for now + this.axes.set(key, new SimpleOrdinalAxis(key, key, 'choice', value)); } }); } diff --git a/src/webui/src/static/model/trial.ts b/src/webui/src/static/model/trial.ts index ebdd35bc77..c5366fe8c9 100644 --- 
a/src/webui/src/static/model/trial.ts +++ b/src/webui/src/static/model/trial.ts @@ -1,5 +1,6 @@ +import * as JSON5 from 'json5'; import { MetricDataRecord, TrialJobInfo, TableObj, TableRecord, Parameters, FinalType, MultipleAxes, SingleAxis } from '../interface'; -import { getFinal, formatAccuracy, metricAccuracy, parseMetrics, isArrayType } from '../function'; +import { getFinal, formatAccuracy, metricAccuracy, parseMetrics, isArrayType, isNaNorInfinity, formatComplexTypeValue } from '../function'; /** * Get a structured representation of parameters @@ -27,10 +28,10 @@ function inferTrialParameters(paramObj: object, space: MultipleAxes, prefix: str subUnexpected.forEach((v, k) => unexpectedEntries.set(k, v)); } } else { - parameters.set(axisKey, v); + parameters.set(axisKey, formatComplexTypeValue(v)); } } else { - unexpectedEntries.set(prefix + k, v); + unexpectedEntries.set(prefix + k, formatComplexTypeValue(v)); } } return [parameters, unexpectedEntries]; @@ -110,21 +111,26 @@ class Trial implements TableObj { const endTime = this.info.endTime || new Date().getTime(); // eslint-disable-next-line @typescript-eslint/no-non-null-assertion const duration = (endTime - this.info.startTime!) / 1000; - + let accuracy; + if(this.acc !== undefined && this.acc.default !== undefined){ + if(typeof this.acc.default === 'number'){ + accuracy = JSON5.parse(this.acc.default); + }else { + accuracy = this.acc.default; + } + } + return { key: this.info.id, sequenceId: this.info.sequenceId, id: this.info.id, - jobId: this.info.jobId, - parameterId: this.info.parameterId, // eslint-disable-next-line @typescript-eslint/no-non-null-assertion startTime: this.info.startTime!, endTime: this.info.endTime, duration, status: this.info.status, intermediateCount: this.intermediates.length, - // eslint-disable-next-line @typescript-eslint/no-non-null-assertion - accuracy: this.acc !== undefined ? 
JSON.parse(this.acc!.default) : undefined, + accuracy: accuracy, latestAccuracy: this.latestAccuracy, formattedLatestAccuracy: this.formatLatestAccuracy(), accDictionary: this.acc @@ -154,6 +160,9 @@ class Trial implements TableObj { } get acc(): FinalType | undefined { + if (this.info === undefined) { + return undefined; + } return getFinal(this.info.finalMetricData); } @@ -192,10 +201,10 @@ class Trial implements TableObj { } public parameters(axes: MultipleAxes): Map { - const tempHyper = this.info.hyperParameters; - if (tempHyper === undefined) { - throw new Map([['error', 'This trial\'s parameters are not available.']]); + if (this.info === undefined || this.info.hyperParameters === undefined) { + throw new Map(); } else { + const tempHyper = this.info.hyperParameters; let params = JSON.parse(tempHyper[tempHyper.length - 1]).parameters; if (typeof params === 'string') { params = JSON.parse(params); @@ -218,6 +227,7 @@ class Trial implements TableObj { Object.entries(acc).forEach(item => { const [k, v] = item; const column = space.axes.get(k); + if (column !== undefined) { ret.set(column, v); } else { @@ -235,8 +245,11 @@ class Trial implements TableObj { } public finalKeys(): string[] { - // eslint-disable-next-line @typescript-eslint/no-non-null-assertion - return Object.keys(this.acc!); + if(this.acc !== undefined){ + return Object.keys(this.acc); + } else { + return []; + } } /* table obj end */ @@ -290,24 +303,34 @@ class Trial implements TableObj { return !same; } - public formatLatestAccuracy(): string { // TODO: this should be private - if (this.accuracy !== undefined) { - if (isNaN(this.accuracy)) { - return this.accuracy.toString(); + private renderNumber(val: any): string { + if(typeof val === 'number'){ + if (isNaNorInfinity(val)) { + return `${val}`; // show 'NaN' or 'Infinity' } else { - return `${formatAccuracy(this.accuracy)} (FINAL)`; + return `${formatAccuracy(val)} (FINAL)`; } - } else if (this.intermediates.length === 0) { - return '--'; } 
else { - // eslint-disable-next-line @typescript-eslint/no-non-null-assertion - const latest = this.intermediates[this.intermediates.length - 1]!; - if (isNaN(metricAccuracy(latest))) { - return 'NaN'; + // show other types, such as {tensor: {data: }} + return JSON.stringify(val); + } + } + + public formatLatestAccuracy(): string { // TODO: this should be private + if(this.status === 'SUCCEEDED'){ + return (this.accuracy === undefined ? '--': this.renderNumber(this.accuracy)); + } else { + if (this.accuracy !== undefined) { + return this.renderNumber(this.accuracy); + } else if (this.intermediates.length === 0) { + return '--'; } else { - return `${formatAccuracy(metricAccuracy(latest))} (LATEST)`; + // eslint-disable-next-line @typescript-eslint/no-non-null-assertion + const latest = this.intermediates[this.intermediates.length - 1]!; + return this.renderNumber(metricAccuracy(latest)); } } + } } diff --git a/src/webui/src/static/model/trialmanager.ts b/src/webui/src/static/model/trialmanager.ts index bc613e1ba1..ffc0f85f55 100644 --- a/src/webui/src/static/model/trialmanager.ts +++ b/src/webui/src/static/model/trialmanager.ts @@ -7,29 +7,13 @@ import { requestAxios } from '../function'; function groupMetricsByTrial(metrics: MetricDataRecord[]): Map { const ret = new Map(); for (const metric of metrics) { - const trialId = `${metric.trialJobId}-${metric.parameterId}`; - metric.trialId = trialId; - if (ret.has(trialId)) { + if (ret.has(metric.trialJobId)) { // eslint-disable-next-line @typescript-eslint/no-non-null-assertion - ret.get(trialId)!.push(metric); + ret.get(metric.trialJobId)!.push(metric); } else { - ret.set(trialId, [metric]); + ret.set(metric.trialJobId, [ metric ]); } } - // to compatiable with multi-trial in same job, fix offset of sequence - ret.forEach((trialMetrics) => { - let minSequenceNumber = Number.POSITIVE_INFINITY; - trialMetrics.map((item) => { - if (item.sequence < minSequenceNumber && item.type !== "FINAL") { - minSequenceNumber = 
item.sequence; - } - }); - trialMetrics.map((item) => { - if (item.type !== "FINAL") { - item.sequence -= minSequenceNumber; - } - }); - }); return ret; } @@ -48,6 +32,16 @@ class TrialManager { private latestMetricdataErrorMessage: string = ''; // metric-data-latest error message private isMetricdataRangeError: boolean = false; // metric-data-range api error filed private metricdataRangeErrorMessage: string = ''; // metric-data-latest error message + private metricsList: Array = []; + private trialJobList: Array = []; + + public getMetricsList(): Array { + return this.metricsList; + } + + public getTrialJobList(): Array { + return this.trialJobList; + } public async init(): Promise { while (!this.infoInitialized || !this.metricInitialized) { @@ -135,57 +129,6 @@ class TrialManager { return new MetricSpace([...this.trials.values()]); } - public static expandJobsToTrials(jobs: TrialJobInfo[]): TrialJobInfo[] { - const trials: TrialJobInfo[] = []; - - for (const jobInfo of jobs as TrialJobInfo[]) { - if (jobInfo.hyperParameters) { - let trial: TrialJobInfo | undefined; - let lastTrial: TrialJobInfo | undefined; - for (let i = 0; i < jobInfo.hyperParameters.length; i++) { - const hyperParameters = jobInfo.hyperParameters[i] - const hpObject = JSON.parse(hyperParameters); - const parameterId = hpObject["parameter_id"]; - trial = { - id: `${jobInfo.id}-${parameterId}`, - jobId: jobInfo.id, - parameterId: parameterId, - sequenceId: parameterId, - status: "SUCCEEDED", - startTime: jobInfo.startTime, - endTime: jobInfo.startTime, - hyperParameters: [hyperParameters], - logPath: jobInfo.logPath, - stderrPath: jobInfo.stderrPath, - }; - if (jobInfo.finalMetricData) { - for (const metricData of jobInfo.finalMetricData) { - if (metricData.parameterId == parameterId) { - trial.finalMetricData = [metricData]; - trial.endTime = metricData.timestamp; - break; - } - } - } - if (lastTrial) { - trial.startTime = lastTrial.endTime; - } else { - trial.startTime = jobInfo.startTime; - } 
- lastTrial = trial; - trials.push(trial); - } - if (lastTrial !== undefined) { - lastTrial.status = jobInfo.status; - lastTrial.endTime = jobInfo.endTime; - } - } else { - trials.push(jobInfo); - } - } - return trials; - } - // if this.jobListError = true, show trial error message [/trial-jobs] public jobListError(): boolean { return this.isJobListError; @@ -229,8 +172,7 @@ class TrialManager { let updated = false; requestAxios(`${MANAGER_IP}/trial-jobs`) .then(data => { - const newTrials = TrialManager.expandJobsToTrials(data as any); - for (const trialInfo of newTrials as TrialJobInfo[]) { + for (const trialInfo of data as TrialJobInfo[]) { if (this.trials.has(trialInfo.id)) { // eslint-disable-next-line @typescript-eslint/no-non-null-assertion updated = this.trials.get(trialInfo.id)!.updateTrialJobInfo(trialInfo) || updated; @@ -265,7 +207,10 @@ class TrialManager { private async updateAllMetrics(): Promise { return requestAxios(`${MANAGER_IP}/metric-data`) - .then(data => this.doUpdateMetrics(data as any, false)) + .then(data => { + this.metricsList = data; + return this.doUpdateMetrics(data as any, false); + }) .catch(error => { this.isMetricdataError = true; this.MetricdataErrorMessage = `${error.message}`; diff --git a/src/webui/src/static/style/compare.scss b/src/webui/src/static/style/compare.scss index ba45ccac98..37f70a49c4 100644 --- a/src/webui/src/static/style/compare.scss +++ b/src/webui/src/static/style/compare.scss @@ -1,14 +1,17 @@ .compare-modal{ /* decide modal size */ .ms-Dialog-main{ - max-width: 70%; + width: 50%; + overflow: hidden; } /* compare-md: table style */ &-table{ width: 92%; - table-layout: fixed; margin: 0 auto; + margin-bottom: 20px; + border: 1px solid transparent; + overflow: auto; color: #333; tr{ line-height: 30px; diff --git a/src/webui/src/static/style/overview.scss b/src/webui/src/static/style/overview.scss index f636424fdd..162c878e5c 100644 --- a/src/webui/src/static/style/overview.scss +++ 
b/src/webui/src/static/style/overview.scss @@ -12,7 +12,7 @@ padding: 15px 20px; height: 100%; min-width: 500px; - overflow-y: scroll; + overflow-y: auto; } .padItem{ diff --git a/src/webui/src/static/style/succTable.scss b/src/webui/src/static/style/succTable.scss index 3e2dbdfa86..05b37035bd 100644 --- a/src/webui/src/static/style/succTable.scss +++ b/src/webui/src/static/style/succTable.scss @@ -1,6 +1,6 @@ #succTable{ height: 404px; - overflow-y: scroll; + overflow: auto; position: relative; .succTable-tooltip{ position: absolute; diff --git a/src/webui/src/static/style/table.scss b/src/webui/src/static/style/table.scss index d8f57dd424..25ca54e3e0 100644 --- a/src/webui/src/static/style/table.scss +++ b/src/webui/src/static/style/table.scss @@ -7,7 +7,7 @@ height: 324px; overflow: hidden; #succeTable .commonTableStyle{ - overflow-y: scroll; + overflow-y: auto; } } @@ -55,5 +55,5 @@ } .columns-height{ max-height: 335px; - overflow-y: scroll; + overflow-y: auto; } diff --git a/test/config/integration_tests.yml b/test/config/integration_tests.yml index b3802239da..dfd34e9ae2 100644 --- a/test/config/integration_tests.yml +++ b/test/config/integration_tests.yml @@ -140,8 +140,8 @@ testCases: config: maxTrialNum: 4 trialConcurrency: 4 - launchCommand: python3 -c 'import nnicli as nc; nc.start_nni("$configFile")' - stopCommand: python3 -c 'import nnicli as nc; nc.stop_nni()' + launchCommand: python3 -c 'from nnicli import Experiment; exp = Experiment(); exp.start_experiment("$configFile")' + stopCommand: python3 -c 'from nnicli import Experiment; exp = Experiment(); exp.connect_experiment("http://localhost:8080/"); exp.stop_experiment()' validator: class: NnicliValidator platform: linux darwin @@ -158,7 +158,7 @@ testCases: configFile: test/config/examples/sklearn-regression.yml setExperimentIdtoVar: $resumeExpId # for subfolder in codedir test - launchCommand: mkdir -p ../examples/trials/sklearn/regression/subfolder && touch 
../examples/trials/sklearn/regression/subfolder/subfile && nnictl create --config $configFile --debug + launchCommand: python3 -c "import os; os.makedirs('../examples/trials/sklearn/regression/subfolder', exist_ok=True); open('../examples/trials/sklearn/regression/subfolder/subfile', 'a').close()" && nnictl create --config $configFile --debug # Experiment resume test part 2 - name: nnictl-resume-2 diff --git a/test/config/integration_tests_tf2.yml b/test/config/integration_tests_tf2.yml index e060511289..2002f36367 100644 --- a/test/config/integration_tests_tf2.yml +++ b/test/config/integration_tests_tf2.yml @@ -110,8 +110,8 @@ testCases: config: maxTrialNum: 4 trialConcurrency: 4 - launchCommand: python3 -c 'import nnicli as nc; nc.start_nni("$configFile")' - stopCommand: python3 -c 'import nnicli as nc; nc.stop_nni()' + launchCommand: python3 -c 'from nnicli import Experiment; exp = Experiment(); exp.start_experiment("$configFile")' + stopCommand: python3 -c 'from nnicli import Experiment; exp = Experiment(); exp.connect_experiment("http://localhost:8080/"); exp.stop_experiment()' validator: class: NnicliValidator platform: linux darwin diff --git a/test/config/pr_tests.yml b/test/config/pr_tests.yml index d49bf9d7ec..f82143b836 100644 --- a/test/config/pr_tests.yml +++ b/test/config/pr_tests.yml @@ -45,10 +45,10 @@ testCases: - name: nnicli configFile: test/config/examples/sklearn-regression.yml config: - maxTrialNum: 2 - trialConcurrency: 2 - launchCommand: python3 -c 'import nnicli as nc; nc.start_nni("$configFile")' - stopCommand: python3 -c 'import nnicli as nc; nc.stop_nni()' + maxTrialNum: 4 + trialConcurrency: 4 + launchCommand: python3 -c 'from nnicli import Experiment; exp = Experiment(); exp.start_experiment("$configFile")' + stopCommand: python3 -c 'from nnicli import Experiment; exp = Experiment(); exp.connect_experiment("http://localhost:8080/"); exp.stop_experiment()' validator: class: NnicliValidator platform: linux darwin diff --git 
a/test/nni_test/nnitest/remote_docker.py b/test/nni_test/nnitest/remote_docker.py index d0252e3feb..2c89c34374 100644 --- a/test/nni_test/nnitest/remote_docker.py +++ b/test/nni_test/nnitest/remote_docker.py @@ -37,7 +37,7 @@ def start_container(image, name, nnimanager_os): '''Start docker container, generate a port in /tmp/nnitest/{name}/port file''' port = find_port() source_dir = '/tmp/nnitest/' + name - run_cmds = ['docker', 'run', '-d', '-p', str(port) + ':22', '--name', name, '--mount', 'type=bind,source=' + source_dir + ',target=/tmp/nni', image] + run_cmds = ['docker', 'run', '-d', '-t', '-p', str(port) + ':22', '--name', name, '--mount', 'type=bind,source=' + source_dir + ',target=/tmp/nni', image] output = check_output(run_cmds) commit_id = output.decode('utf-8') @@ -57,7 +57,7 @@ def get_dist(wheel_name): else: return '/tmp/nni/dist/{0}'.format(wheel_name) - pip_cmds = ['docker', 'exec', name, 'python3', '-m', 'pip', 'install', '--upgrade', 'pip', 'setuptools==39.1.0'] + pip_cmds = ['docker', 'exec', name, 'python3', '-m', 'pip', 'install', '--upgrade', 'pip', 'setuptools==41.0.0'] check_call(pip_cmds) sdk_cmds = ['docker', 'exec', name, 'python3', '-m', 'pip', 'install', get_dist(wheel_name)] check_call(sdk_cmds) diff --git a/test/nni_test/nnitest/validators.py b/test/nni_test/nnitest/validators.py index 1cdadb8669..5ad9090c18 100644 --- a/test/nni_test/nnitest/validators.py +++ b/test/nni_test/nnitest/validators.py @@ -6,7 +6,7 @@ import subprocess import json import requests -import nnicli as nc +from nnicli import Experiment from utils import METRICS_URL @@ -80,8 +80,8 @@ def get_metric_results(self, metrics): class NnicliValidator(ITValidator): def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs): print(rest_endpoint) - nc.set_endpoint(rest_endpoint) - #print(nc.version()) - print(nc.get_job_statistics()) - print(nc.get_experiment_status()) - print(nc.list_trial_jobs()) + exp = Experiment() + 
exp.connect_experiment(rest_endpoint) + print(exp.get_job_statistics()) + print(exp.get_experiment_status()) + print(exp.list_trial_jobs()) diff --git a/test/pipelines/pipelines-it-local-tf2.yml b/test/pipelines/pipelines-it-local-tf2.yml index 26f3d4c87d..2b9526c142 100644 --- a/test/pipelines/pipelines-it-local-tf2.yml +++ b/test/pipelines/pipelines-it-local-tf2.yml @@ -10,7 +10,7 @@ jobs: displayName: 'Install nni toolkit via source code' - script: | set -e - python3 -m pip install scikit-learn==0.20.0 --user + python3 -m pip install scikit-learn==0.23.2 --user python3 -m pip install torch==1.3.1 torchvision==0.4.2 -f https://download.pytorch.org/whl/torch_stable.html --user python3 -m pip install tensorflow-gpu==2.2.0 tensorflow-estimator==2.2.0 --force --user python3 -m pip install keras==2.4.2 --user diff --git a/test/pipelines/pipelines-it-local-windows.yml b/test/pipelines/pipelines-it-local-windows.yml index bfdf4eed21..4b791c762c 100644 --- a/test/pipelines/pipelines-it-local-windows.yml +++ b/test/pipelines/pipelines-it-local-windows.yml @@ -7,7 +7,7 @@ jobs: powershell.exe -file install.ps1 displayName: 'Install nni toolkit via source code' - script: | - python -m pip install scikit-learn==0.20.0 --user + python -m pip install scikit-learn==0.23.2 --user python -m pip install keras==2.1.6 --user python -m pip install torchvision===0.4.1 torch===1.3.1 -f https://download.pytorch.org/whl/torch_stable.html --user python -m pip install tensorflow-gpu==1.15.2 tensorflow-estimator==1.15.1 --force --user diff --git a/test/pipelines/pipelines-it-local.yml b/test/pipelines/pipelines-it-local.yml index ec6735a473..eb72e4099d 100644 --- a/test/pipelines/pipelines-it-local.yml +++ b/test/pipelines/pipelines-it-local.yml @@ -10,7 +10,7 @@ jobs: displayName: 'Install nni toolkit via source code' - script: | set -e - python3 -m pip install scikit-learn==0.20.0 --user + python3 -m pip install scikit-learn==0.23.2 --user python3 -m pip install torchvision==0.4.2 --user 
python3 -m pip install torch==1.3.1 --user python3 -m pip install keras==2.1.6 --user diff --git a/test/pipelines/pipelines-it-pai-windows.yml b/test/pipelines/pipelines-it-pai-windows.yml index 3bdb4ee69b..1765868827 100644 --- a/test/pipelines/pipelines-it-pai-windows.yml +++ b/test/pipelines/pipelines-it-pai-windows.yml @@ -62,7 +62,7 @@ jobs: displayName: 'Install nni toolkit via source code' - script: | set PATH=$(ENV_PATH) - python -m pip install scikit-learn==0.21.0 --user + python -m pip install scikit-learn==0.23.2 --user displayName: 'Install dependencies for integration tests' - script: | cd test @@ -71,4 +71,4 @@ jobs: mount -o anon $(pai_nfs_uri) $(local_nfs_uri) python nni_test/nnitest/generate_ts_config.py --ts pai --pai_token $(pai_token) --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $(docker_image) --pai_storage_config_name $(pai_storage_config_name) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) --vc $(virtual_cluster) python nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai - displayName: 'Examples and advanced features tests on pai' \ No newline at end of file + displayName: 'Examples and advanced features tests on pai' diff --git a/test/pipelines/pipelines-it-remote-windows-to-linux.yml b/test/pipelines/pipelines-it-remote-windows-to-linux.yml index 36a98a9819..d87230201a 100644 --- a/test/pipelines/pipelines-it-remote-windows-to-linux.yml +++ b/test/pipelines/pipelines-it-remote-windows-to-linux.yml @@ -16,7 +16,7 @@ jobs: powershell.exe -file install.ps1 displayName: 'Install nni toolkit via source code' - script: | - python -m pip install scikit-learn==0.20.1 --user + python -m pip install scikit-learn==0.23.2 --user displayName: 'Install dependencies for integration tests' - task: SSH@0 inputs: diff --git a/tools/nni_cmd/common_utils.py b/tools/nni_cmd/common_utils.py index 
4166bf034c..2edbf667df 100644 --- a/tools/nni_cmd/common_utils.py +++ b/tools/nni_cmd/common_utils.py @@ -4,7 +4,10 @@ import os import sys import json +import tempfile import socket +import string +import random import ruamel.yaml as yaml import psutil from colorama import Fore @@ -83,3 +86,12 @@ def check_tensorboard_version(): print_error('import tensorboard error!') exit(1) +def generate_temp_dir(): + '''generate a temp folder''' + def generate_folder_name(): + return os.path.join(tempfile.gettempdir(), 'nni', ''.join(random.sample(string.ascii_letters + string.digits, 8))) + temp_dir = generate_folder_name() + while os.path.exists(temp_dir): + temp_dir = generate_folder_name() + os.makedirs(temp_dir) + return temp_dir diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index 631abcf5cc..f6aee3a450 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -245,7 +245,7 @@ def validate(self, data): 'codeDir': setPathCheck('codeDir'), 'command': setType('command', str), 'image': setType('image', str), - 'computeTarget': setType('computeTarget', str) + Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999), } } @@ -254,6 +254,9 @@ def validate(self, data): 'subscriptionId': setType('subscriptionId', str), 'resourceGroup': setType('resourceGroup', str), 'workspaceName': setType('workspaceName', str), + 'computeTarget': setType('computeTarget', str), + Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int), + Optional('useActiveGpu'): setType('useActiveGpu', bool), } } diff --git a/tools/nni_cmd/config_utils.py b/tools/nni_cmd/config_utils.py index 8cc1dc8ada..e6472ee3de 100644 --- a/tools/nni_cmd/config_utils.py +++ b/tools/nni_cmd/config_utils.py @@ -54,13 +54,13 @@ def __init__(self): self.experiment_file = os.path.join(NNICTL_HOME_DIR, '.experiment') self.experiments = self.read_file() - def add_experiment(self, expId, port, time, file_name, platform, experiment_name): + def add_experiment(self, 
expId, port, startTime, file_name, platform, experiment_name, endTime='N/A', status='INITIALIZED'): '''set {key:value} paris to self.experiment''' self.experiments[expId] = {} self.experiments[expId]['port'] = port - self.experiments[expId]['startTime'] = time - self.experiments[expId]['endTime'] = 'N/A' - self.experiments[expId]['status'] = 'INITIALIZED' + self.experiments[expId]['startTime'] = startTime + self.experiments[expId]['endTime'] = endTime + self.experiments[expId]['status'] = status self.experiments[expId]['fileName'] = file_name self.experiments[expId]['platform'] = platform self.experiments[expId]['experimentName'] = experiment_name diff --git a/tools/nni_cmd/constants.py b/tools/nni_cmd/constants.py index 5a37c3a1f1..0654473ed4 100644 --- a/tools/nni_cmd/constants.py +++ b/tools/nni_cmd/constants.py @@ -6,6 +6,8 @@ NNICTL_HOME_DIR = os.path.join(os.path.expanduser('~'), '.local', 'nnictl') +NNI_HOME_DIR = os.path.join(os.path.expanduser('~'), 'nni-experiments') + ERROR_INFO = 'ERROR: ' NORMAL_INFO = 'INFO: ' WARNING_INFO = 'WARNING: ' diff --git a/tools/nni_cmd/nnictl.py b/tools/nni_cmd/nnictl.py index 6a2991fe50..213554d5e8 100644 --- a/tools/nni_cmd/nnictl.py +++ b/tools/nni_cmd/nnictl.py @@ -11,7 +11,8 @@ from .nnictl_utils import stop_experiment, trial_ls, trial_kill, list_experiment, experiment_status,\ log_trial, experiment_clean, platform_clean, experiment_list, \ monitor_experiment, export_trials_data, trial_codegen, webui_url, \ - get_config, log_stdout, log_stderr, search_space_auto_gen, webui_nas + get_config, log_stdout, log_stderr, search_space_auto_gen, webui_nas, \ + save_experiment, load_experiment from .package_management import package_install, package_uninstall, package_show, package_list from .constants import DEFAULT_REST_PORT from .tensorboard_utils import start_tensorboard, stop_tensorboard @@ -102,6 +103,8 @@ def parse_args(): parser_trial_subparsers = parser_trial.add_subparsers() parser_trial_ls = 
parser_trial_subparsers.add_parser('ls', help='list trial jobs') parser_trial_ls.add_argument('id', nargs='?', help='the id of experiment') + parser_trial_ls.add_argument('--head', type=int, help='list the highest experiments on the default metric') + parser_trial_ls.add_argument('--tail', type=int, help='list the lowest experiments on the default metric') parser_trial_ls.set_defaults(func=trial_ls) parser_trial_kill = parser_trial_subparsers.add_parser('kill', help='kill trial jobs') parser_trial_kill.add_argument('id', nargs='?', help='the id of experiment') @@ -129,15 +132,6 @@ def parse_args(): parser_experiment_clean.add_argument('id', nargs='?', help='the id of experiment') parser_experiment_clean.add_argument('--all', action='store_true', default=False, help='delete all of experiments') parser_experiment_clean.set_defaults(func=experiment_clean) - - #parse experiment command - parser_platform = subparsers.add_parser('platform', help='get platform information') - #add subparsers for parser_experiment - parser_platform_subparsers = parser_platform.add_subparsers() - parser_platform_clean = parser_platform_subparsers.add_parser('clean', help='clean up the platform data') - parser_platform_clean.add_argument('--config', '-c', required=True, dest='config', help='the path of yaml config file') - parser_platform_clean.set_defaults(func=platform_clean) - #import tuning data parser_import_data = parser_experiment_subparsers.add_parser('import', help='import additional data') parser_import_data.add_argument('id', nargs='?', help='the id of experiment') @@ -148,7 +142,32 @@ def parse_args(): parser_trial_export.add_argument('id', nargs='?', help='the id of experiment') parser_trial_export.add_argument('--type', '-t', choices=['json', 'csv'], required=True, dest='type', help='target file type') parser_trial_export.add_argument('--filename', '-f', required=True, dest='path', help='target file path') + parser_trial_export.add_argument('--intermediate', '-i', 
action='store_true', + default=False, help='are intermediate results included') parser_trial_export.set_defaults(func=export_trials_data) + #save an NNI experiment + parser_save_experiment = parser_experiment_subparsers.add_parser('save', help='save an experiment') + parser_save_experiment.add_argument('id', nargs='?', help='the id of experiment') + parser_save_experiment.add_argument('--path', '-p', required=False, help='the folder path to store nni experiment data, \ + default current working directory') + parser_save_experiment.add_argument('--saveCodeDir', '-s', action='store_true', default=False, help='save codeDir data \ + of the experiment') + parser_save_experiment.set_defaults(func=save_experiment) + #load an NNI experiment + parser_load_experiment = parser_experiment_subparsers.add_parser('load', help='load an experiment') + parser_load_experiment.add_argument('--path', '-p', required=True, help='the path of nni package file') + parser_load_experiment.add_argument('--codeDir', '-c', required=True, help='the path of codeDir for loaded experiment, \ + this path will also put the code in the loaded experiment package') + parser_load_experiment.add_argument('--logDir', '-l', required=False, help='the path of logDir for loaded experiment') + parser_load_experiment.set_defaults(func=load_experiment) + + #parse platform command + parser_platform = subparsers.add_parser('platform', help='get platform information') + #add subparsers for parser_platform + parser_platform_subparsers = parser_platform.add_subparsers() + parser_platform_clean = parser_platform_subparsers.add_parser('clean', help='clean up the platform data') + parser_platform_clean.add_argument('--config', '-c', required=True, dest='config', help='the path of yaml config file') + parser_platform_clean.set_defaults(func=platform_clean) #TODO:finish webui function #parse board command diff --git a/tools/nni_cmd/nnictl_utils.py b/tools/nni_cmd/nnictl_utils.py index bbbf54fcc6..3fad64b1fc 100644 --- 
a/tools/nni_cmd/nnictl_utils.py +++ b/tools/nni_cmd/nnictl_utils.py @@ -9,6 +9,7 @@ import re import shutil import subprocess +from functools import cmp_to_key from datetime import datetime, timezone from pathlib import Path from subprocess import Popen @@ -16,11 +17,11 @@ from nni.package_utils import get_nni_installation_path from nni_annotation import expand_annotations from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response -from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, export_data_url +from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, export_data_url, metric_data_url from .config_utils import Config, Experiments -from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, \ +from .constants import NNICTL_HOME_DIR, NNI_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, \ EXPERIMENT_MONITOR_INFO, TRIAL_MONITOR_HEAD, TRIAL_MONITOR_CONTENT, TRIAL_MONITOR_TAIL, REST_TIME_OUT -from .common_utils import print_normal, print_error, print_warning, detect_process, get_yml_content +from .common_utils import print_normal, print_error, print_warning, detect_process, get_yml_content, generate_temp_dir from .command_utils import check_output_command, kill_command from .ssh_utils import create_ssh_sftp_client, remove_remote_directory @@ -248,6 +249,20 @@ def stop_experiment(args): def trial_ls(args): '''List trial''' + def final_metric_data_cmp(lhs, rhs): + metric_l = json.loads(json.loads(lhs['finalMetricData'][0]['data'])) + metric_r = json.loads(json.loads(rhs['finalMetricData'][0]['data'])) + if isinstance(metric_l, float): + return metric_l - metric_r + elif isinstance(metric_l, dict): + return metric_l['default'] - metric_r['default'] + else: + print_error('Unexpected data format. 
Please check your data.') + raise ValueError + + if args.head and args.tail: + print_error('Head and tail cannot be set at the same time.') + return nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') rest_pid = nni_config.get_config('restServerPid') @@ -259,6 +274,14 @@ def trial_ls(args): response = rest_get(trial_jobs_url(rest_port), REST_TIME_OUT) if response and check_response(response): content = json.loads(response.text) + if args.head: + assert args.head > 0, 'The number of requested data must be greater than 0.' + content = sorted(filter(lambda x: 'finalMetricData' in x, content), + key=cmp_to_key(final_metric_data_cmp), reverse=True)[:args.head] + elif args.tail: + assert args.tail > 0, 'The number of requested data must be greater than 0.' + content = sorted(filter(lambda x: 'finalMetricData' in x, content), + key=cmp_to_key(final_metric_data_cmp))[:args.tail] for index, value in enumerate(content): content[index] = convert_time_stamp_to_date(value) print(json.dumps(content, indent=4, sort_keys=True, separators=(',', ':'))) @@ -681,45 +704,64 @@ def monitor_experiment(args): set_monitor(False, args.time) def export_trials_data(args): - '''export experiment metadata to csv + '''export experiment metadata and intermediate results to json or csv ''' + def groupby_trial_id(intermediate_results): + sorted(intermediate_results, key=lambda x: x['timestamp']) + groupby = dict() + for content in intermediate_results: + groupby.setdefault(content['trialJobId'], []).append(json.loads(content['data'])) + return groupby + nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') rest_pid = nni_config.get_config('restServerPid') + if not detect_process(rest_pid): print_error('Experiment is not running...') return running, response = check_rest_server_quick(rest_port) - if running: - response = rest_get(export_data_url(rest_port), 20) - if response is not None and 
check_response(response): - if args.type == 'json': - with open(args.path, 'w') as file: - file.write(response.text) - elif args.type == 'csv': - content = json.loads(response.text) - trial_records = [] - for record in content: - record_value = json.loads(record['value']) - if not isinstance(record_value, (float, int)): - formated_record = {**record['parameter'], **record_value, **{'id': record['id']}} - else: - formated_record = {**record['parameter'], **{'reward': record_value, 'id': record['id']}} - trial_records.append(formated_record) - if not trial_records: - print_error('No trial results collected! Please check your trial log...') - exit(0) - with open(args.path, 'w', newline='') as file: - writer = csv.DictWriter(file, set.union(*[set(r.keys()) for r in trial_records])) - writer.writeheader() - writer.writerows(trial_records) - else: - print_error('Unknown type: %s' % args.type) - exit(1) + if not running: + print_error('Restful server is not running') + return + response = rest_get(export_data_url(rest_port), 20) + if response is not None and check_response(response): + content = json.loads(response.text) + if args.intermediate: + intermediate_results_response = rest_get(metric_data_url(rest_port), REST_TIME_OUT) + if not intermediate_results_response or not check_response(intermediate_results_response): + print_error('Error getting intermediate results.') + return + intermediate_results = groupby_trial_id(json.loads(intermediate_results_response.text)) + for record in content: + record['intermediate'] = intermediate_results[record['id']] + if args.type == 'json': + with open(args.path, 'w') as file: + file.write(json.dumps(content)) + elif args.type == 'csv': + trial_records = [] + for record in content: + formated_record = dict() + if args.intermediate: + formated_record['intermediate'] = '[' + ','.join(record['intermediate']) + ']' + record_value = json.loads(record['value']) + if not isinstance(record_value, (float, int)): + 
formated_record.update({**record['parameter'], **record_value, **{'id': record['id']}}) + else: + formated_record.update({**record['parameter'], **{'reward': record_value, 'id': record['id']}}) + trial_records.append(formated_record) + if not trial_records: + print_error('No trial results collected! Please check your trial log...') + exit(0) + with open(args.path, 'w', newline='') as file: + writer = csv.DictWriter(file, set.union(*[set(r.keys()) for r in trial_records])) + writer.writeheader() + writer.writerows(trial_records) else: - print_error('Export failed...') + print_error('Unknown type: %s' % args.type) + return else: - print_error('Restful server is not Running') + print_error('Export failed...') def search_space_auto_gen(args): '''dry run trial code to generate search space file''' @@ -736,3 +778,166 @@ def search_space_auto_gen(args): print_warning('Expected search space file \'{}\' generated, but not found.'.format(file_path)) else: print_normal('Generate search space done: \'{}\'.'.format(file_path)) + +def save_experiment(args): + '''save experiment data to a zip file''' + experiment_config = Experiments() + experiment_dict = experiment_config.get_all_experiments() + if args.id is None: + print_error('Please set experiment id.') + exit(1) + if args.id not in experiment_dict: + print_error('Cannot find experiment {0}.'.format(args.id)) + exit(1) + if experiment_dict[args.id].get('status') != 'STOPPED': + print_error('Can only save stopped experiment!') + exit(1) + print_normal('Saving...') + nni_config = Config(experiment_dict[args.id]['fileName']) + logDir = os.path.join(NNI_HOME_DIR, args.id) + if nni_config.get_config('logDir'): + logDir = os.path.join(nni_config.get_config('logDir'), args.id) + temp_root_dir = generate_temp_dir() + + # Step1. Copy logDir to temp folder + if not os.path.exists(logDir): + print_error('logDir: %s does not exist!' 
% logDir) + exit(1) + temp_experiment_dir = os.path.join(temp_root_dir, 'experiment') + shutil.copytree(logDir, temp_experiment_dir) + + # Step2. Copy nnictl metadata to temp folder + temp_nnictl_dir = os.path.join(temp_root_dir, 'nnictl') + os.makedirs(temp_nnictl_dir, exist_ok=True) + try: + with open(os.path.join(temp_nnictl_dir, '.experiment'), 'w') as file: + experiment_dict[args.id]['id'] = args.id + json.dump(experiment_dict[args.id], file) + except IOError: + print_error('Write file to %s failed!' % os.path.join(temp_nnictl_dir, '.experiment')) + exit(1) + nnictl_config_dir = os.path.join(NNICTL_HOME_DIR, experiment_dict[args.id]['fileName']) + shutil.copytree(nnictl_config_dir, os.path.join(temp_nnictl_dir, experiment_dict[args.id]['fileName'])) + + # Step3. Copy code dir + if args.saveCodeDir: + temp_code_dir = os.path.join(temp_root_dir, 'code') + shutil.copytree(nni_config.get_config('experimentConfig')['trial']['codeDir'], temp_code_dir) + + # Step4. Archive folder + zip_package_name = 'nni_experiment_%s' % args.id + if args.path: + os.makedirs(args.path, exist_ok=True) + zip_package_name = os.path.join(args.path, zip_package_name) + shutil.make_archive(zip_package_name, 'zip', temp_root_dir) + print_normal('Save to %s.zip success!' % zip_package_name) + + # Step5. Cleanup temp data + shutil.rmtree(temp_root_dir) + +def load_experiment(args): + '''load experiment data''' + package_path = os.path.expanduser(args.path) + if not os.path.exists(args.path): + print_error('file path %s does not exist!' % args.path) + exit(1) + temp_root_dir = generate_temp_dir() + shutil.unpack_archive(package_path, temp_root_dir) + print_normal('Loading...') + # Step1. 
Validation + if not os.path.exists(args.codeDir): + print_error('Invalid: codeDir path does not exist!') + exit(1) + if args.logDir: + if not os.path.exists(args.logDir): + print_error('Invalid: logDir path does not exist!') + exit(1) + experiment_temp_dir = os.path.join(temp_root_dir, 'experiment') + if not os.path.exists(os.path.join(experiment_temp_dir, 'db')): + print_error('Invalid archive file: db file does not exist!') + shutil.rmtree(temp_root_dir) + exit(1) + nnictl_temp_dir = os.path.join(temp_root_dir, 'nnictl') + if not os.path.exists(os.path.join(nnictl_temp_dir, '.experiment')): + print_error('Invalid archive file: nnictl metadata file does not exist!') + shutil.rmtree(temp_root_dir) + exit(1) + try: + with open(os.path.join(nnictl_temp_dir, '.experiment'), 'r') as file: + experiment_metadata = json.load(file) + except ValueError as err: + print_error('Invalid nnictl metadata file: %s' % err) + shutil.rmtree(temp_root_dir) + exit(1) + experiment_config = Experiments() + experiment_dict = experiment_config.get_all_experiments() + experiment_id = experiment_metadata.get('id') + if experiment_id in experiment_dict: + print_error('Invalid: experiment id already exist!') + shutil.rmtree(temp_root_dir) + exit(1) + if not os.path.exists(os.path.join(nnictl_temp_dir, experiment_metadata.get('fileName'))): + print_error('Invalid: experiment metadata does not exist!') + shutil.rmtree(temp_root_dir) + exit(1) + + # Step2. Copy nnictl metadata + src_path = os.path.join(nnictl_temp_dir, experiment_metadata.get('fileName')) + dest_path = os.path.join(NNICTL_HOME_DIR, experiment_metadata.get('fileName')) + if os.path.exists(dest_path): + shutil.rmtree(dest_path) + shutil.copytree(src_path, dest_path) + + # Step3. 
Copy experiment data + nni_config = Config(experiment_metadata.get('fileName')) + nnictl_exp_config = nni_config.get_config('experimentConfig') + if args.logDir: + logDir = args.logDir + nnictl_exp_config['logDir'] = logDir + else: + if nnictl_exp_config.get('logDir'): + logDir = nnictl_exp_config['logDir'] + else: + logDir = NNI_HOME_DIR + os.rename(os.path.join(temp_root_dir, 'experiment'), os.path.join(temp_root_dir, experiment_id)) + src_path = os.path.join(os.path.join(temp_root_dir, experiment_id)) + dest_path = os.path.join(os.path.join(logDir, experiment_id)) + if os.path.exists(dest_path): + shutil.rmtree(dest_path) + shutil.copytree(src_path, dest_path) + + # Step4. Copy code dir + codeDir = os.path.expanduser(args.codeDir) + if not os.path.isabs(codeDir): + codeDir = os.path.join(os.getcwd(), codeDir) + print_normal('Expand codeDir to %s' % codeDir) + nnictl_exp_config['trial']['codeDir'] = codeDir + archive_code_dir = os.path.join(temp_root_dir, 'code') + if os.path.exists(archive_code_dir): + file_list = os.listdir(archive_code_dir) + for file_name in file_list: + src_path = os.path.join(archive_code_dir, file_name) + target_path = os.path.join(codeDir, file_name) + if os.path.exists(target_path): + print_error('Copy %s failed, %s exist!' % (file_name, target_path)) + continue + if os.path.isdir(src_path): + shutil.copytree(src_path, target_path) + else: + shutil.copy(src_path, target_path) + + # Step5. Create experiment metadata + nni_config.set_config('experimentConfig', nnictl_exp_config) + experiment_config.add_experiment(experiment_id, + experiment_metadata.get('port'), + experiment_metadata.get('startTime'), + experiment_metadata.get('fileName'), + experiment_metadata.get('platform'), + experiment_metadata.get('experimentName'), + experiment_metadata.get('endTime'), + experiment_metadata.get('status')) + print_normal('Load experiment %s succsss!' % experiment_id) + + # Step6. 
Cleanup temp data + shutil.rmtree(temp_root_dir) + diff --git a/tools/nni_cmd/updater.py b/tools/nni_cmd/updater.py index 13ee679c49..c9991b8bab 100644 --- a/tools/nni_cmd/updater.py +++ b/tools/nni_cmd/updater.py @@ -14,7 +14,7 @@ def validate_digit(value, start, end): '''validate if a digit is valid''' if not str(value).isdigit() or int(value) < start or int(value) > end: - raise ValueError('%s must be a digit from %s to %s' % (value, start, end)) + raise ValueError('value (%s) must be a digit from %s to %s' % (value, start, end)) def validate_file(path): '''validate if a file exist''' diff --git a/tools/nni_cmd/url_utils.py b/tools/nni_cmd/url_utils.py index 083a865d65..59a28837a6 100644 --- a/tools/nni_cmd/url_utils.py +++ b/tools/nni_cmd/url_utils.py @@ -22,6 +22,11 @@ TENSORBOARD_API = '/tensorboard' +METRIC_DATA_API = '/metric-data' + +def metric_data_url(port): + '''get metric_data url''' + return '{0}:{1}{2}{3}'.format(BASE_URL, port, API_ROOT_URL, METRIC_DATA_API) def check_status_url(port): '''get check_status url'''