Merge pull request #21 from microsoft/master
pull code
chicm-ms authored Jun 24, 2019
2 parents 1241068 + 97829cc commit 1c56fea
Showing 63 changed files with 836 additions and 566 deletions.
8 changes: 4 additions & 4 deletions docs/en_US/CustomizeTuner.md
@@ -31,7 +31,7 @@ class CustomizedTuner(Tuner):
def __init__(self, ...):
...

def receive_trial_result(self, parameter_id, parameters, value):
def receive_trial_result(self, parameter_id, parameters, value, **kwargs):
'''
Receive trial's final result.
parameter_id: int
@@ -41,7 +41,7 @@ class CustomizedTuner(Tuner):
# your code implements here.
...

def generate_parameters(self, parameter_id):
def generate_parameters(self, parameter_id, **kwargs):
'''
Returns a set of trial (hyper-)parameters, as a serializable object
parameter_id: int
@@ -51,15 +51,15 @@ class CustomizedTuner(Tuner):
...
```

`receive_trial_result` will receive `parameter_id, parameters, value` as its input. The `value` object the Tuner receives is exactly the same value that the Trial sends.
`receive_trial_result` will receive `parameter_id, parameters, value` as its input. The `value` object the Tuner receives is exactly the same value that the Trial sends. If `multiPhase` is set to `true` in the experiment configuration file, an additional `trial_job_id` parameter is passed to `receive_trial_result` and `generate_parameters` through the `**kwargs` parameter.
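As an aside (not part of this changeset), here is a minimal sketch of a tuner that reads the optional `trial_job_id` from `**kwargs`; the import path, hyperparameter values, and bookkeeping are assumptions for illustration:

```python
from nni.tuner import Tuner  # assumed import path for the Tuner base class

class CustomizedTuner(Tuner):
    def __init__(self):
        self.history = {}  # trial_job_id -> list of (parameters, value) pairs

    def generate_parameters(self, parameter_id, **kwargs):
        trial_job_id = kwargs.get('trial_job_id')  # only present when multiPhase is true
        return {'learning_rate': 0.01}             # placeholder hyper-parameters

    def receive_trial_result(self, parameter_id, parameters, value, **kwargs):
        trial_job_id = kwargs.get('trial_job_id')
        self.history.setdefault(trial_job_id, []).append((parameters, value))

    def update_search_space(self, search_space):
        pass
```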

The `your_parameters` returned from the `generate_parameters` function will be packaged as a JSON object by the NNI SDK. The SDK then unpacks the JSON object, so the Trial receives exactly the same `your_parameters` from the Tuner.

For example, if you implement `generate_parameters` like this:

```python
def generate_parameters(self, parameter_id):
def generate_parameters(self, parameter_id, **kwargs):
'''
Returns a set of trial (hyper-)parameters, as a serializable object
parameter_id: int
7 changes: 6 additions & 1 deletion docs/en_US/ExperimentConfig.md
@@ -150,10 +150,15 @@ machineList:
Note: The maxExecDuration spec sets the duration of an experiment, not of a trial job. If the experiment reaches the max duration, it will not stop, but it can no longer submit new trial jobs.
* __versionCheck__
* Description
NNI will check the version of the nniManager process against the version of trialKeeper on the remote, PAI, and Kubernetes platforms. If you want to disable the version check, you can set versionCheck to false.
* __debug__
* Description
NNI will check the version of the nniManager process against the version of trialKeeper on the remote, PAI, and Kubernetes platforms. If you want to disable the version check, you can also set debug to true.
Debug mode sets versionCheck to false and logLevel to 'debug'.
* __maxTrialNum__
* Description
10 changes: 9 additions & 1 deletion docs/en_US/MultiPhase.md
@@ -38,7 +38,15 @@ To enable multi-phase, you should also add `multiPhase: true` in your experiment

### Write a tuner that leverages multi-phase:

Before writing a multi-phase tuner, we strongly suggest you go through [Customize Tuner](https://nni.readthedocs.io/en/latest/Customize_Tuner.html). Unlike a normal tuner, your tuner needs to inherit from `MultiPhaseTuner` (in nni.multi_phase_tuner). The key difference between `Tuner` and `MultiPhaseTuner` is that the methods in `MultiPhaseTuner` are aware of an additional piece of information, `trial_job_id`. With this information, the tuner knows which trial is requesting a configuration and which trial is reporting results, which gives it enough flexibility to deal with different trials and different phases. For example, you may want to use the `trial_job_id` parameter of the `generate_parameters` method to generate hyperparameters for a specific trial job.
Before writing a multi-phase tuner, we strongly suggest you go through [Customize Tuner](https://nni.readthedocs.io/en/latest/Customize_Tuner.html). As with a normal tuner, your tuner needs to inherit from the `Tuner` class. When you enable multi-phase through configuration (set `multiPhase` to true), your tuner will get an additional parameter `trial_job_id` via the following tuner methods:
```
generate_parameters
generate_multiple_parameters
receive_trial_result
receive_customized_trial_result
trial_end
```
With this information, the tuner knows which trial is requesting a configuration and which trial is reporting results, which gives it enough flexibility to deal with different trials and different phases. For example, you may want to use the `trial_job_id` parameter of the `generate_parameters` method to generate hyperparameters for a specific trial job (see the sketch below).
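A minimal sketch of what such a tuner could look like (not from this changeset; the per-trial learning-rate schedule and import path are assumptions for illustration, and `trial_end` is assumed to follow the base `Tuner` signature with a `success` flag):

```python
from nni.tuner import Tuner  # assumed import path for the Tuner base class

class PerTrialScheduleTuner(Tuner):
    def __init__(self):
        self.phase = {}  # trial_job_id -> number of configurations served so far

    def generate_parameters(self, parameter_id, **kwargs):
        trial_job_id = kwargs.get('trial_job_id')
        n = self.phase.get(trial_job_id, 0)
        self.phase[trial_job_id] = n + 1
        # Later phases of the same trial job get a smaller learning rate.
        return {'learning_rate': 0.1 / (2 ** n)}

    def receive_trial_result(self, parameter_id, parameters, value, **kwargs):
        trial_job_id = kwargs.get('trial_job_id')
        print('trial %s finished a phase with result %s' % (trial_job_id, value))

    def trial_end(self, parameter_id, success, **kwargs):
        # Drop the schedule once a trial job is done.
        self.phase.pop(kwargs.get('trial_job_id'), None)

    def update_search_space(self, search_space):
        pass
```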

Of course, to use your multi-phase tuner, __you should add `multiPhase: true` to your experiment YAML configuration file__.

2 changes: 1 addition & 1 deletion docs/en_US/Trials.md
@@ -144,7 +144,7 @@ export NNI_TRIAL_SEQ_ID=1
export MULTI_PHASE=false
export CUDA_VISIBLE_DEVICES=
eval python3 mnist.py 2>/home/user_name/nni/experiments/$experiment_id$/trials/$trial_id$/stderr
echo $? `date +%s000` >/home/user_name/nni/experiments/$experiment_id$/trials/$trial_id$/.nni/state
echo $? `date +%s%3N` >/home/user_name/nni/experiments/$experiment_id$/trials/$trial_id$/.nni/state
```
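A side note on the line above (not part of the diff): `date +%s000` appends a literal "000" to a seconds-resolution epoch, while GNU `date +%s%3N` yields a true millisecond timestamp. A rough Python equivalent of the two styles, for comparison only:

```python
import time

now = time.time()
old_style = '%d000' % int(now)       # like `date +%s000`: seconds padded with a fake "000"
new_style = '%d' % int(now * 1000)   # like `date +%s%3N`: real millisecond resolution
print(old_style, new_style)
```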

### Other Modes
9 changes: 7 additions & 2 deletions docs/en_US/WebUI.md
@@ -8,6 +8,7 @@ Click the tab "Overview".
* Support for downloading the experiment result.
* Support for exporting the nni-manager and dispatcher log files.
* If you have any questions, you can click "Feedback" to report them.
* If your experiment has more than 1,000 trials, you can change the refresh interval here.

![](../img/webui-img/over1.png)
* See trials with good performance.
@@ -58,6 +59,10 @@ Click the tab "Trials Detail" to see the status of all the trials. Specifically:

![](../img/webui-img/addColumn.png)

* If you want to compare some trials, you can select them and then click "Compare" to see the results.

![](../img/webui-img/compare.png)

* You can use the button named "Copy as python" to copy the trial's parameters.

![](../img/webui-img/copyParameter.png)
@@ -69,6 +74,6 @@ Click the tab "Trials Detail" to see the status of all the trials. Specifically:

* Kill: you can kill a job whose status is running.
* Support for searching for a specific trial.
* Intermediate Result Graph.
* Intermediate Result Graph: you can see the default metric and other reported keys in this graph.

![](../img/intermediate.png)
![](../img/webui-img/intermediate.png)
Binary file removed docs/img/intermediate.png
Binary file modified docs/img/webui-img/addColumn.png
Binary file added docs/img/webui-img/compare.png
Binary file modified docs/img/webui-img/copyParameter.png
Binary file modified docs/img/webui-img/detail-local.png
Binary file modified docs/img/webui-img/detail-pai.png
Binary file added docs/img/webui-img/intermediate.png
Binary file modified docs/img/webui-img/over1.png
4 changes: 2 additions & 2 deletions docs/zh_CN/Trials.md
@@ -149,7 +149,7 @@ export NNI_TRIAL_SEQ_ID=1
export MULTI_PHASE=false
export CUDA_VISIBLE_DEVICES=
eval python3 mnist.py 2>/home/user_name/nni/experiments/$experiment_id$/trials/$trial_id$/stderr
echo $? `date +%s000` >/home/user_name/nni/experiments/$experiment_id$/trials/$trial_id$/.nni/state
echo $? `date +%s%3N` >/home/user_name/nni/experiments/$experiment_id$/trials/$trial_id$/.nni/state
```

### Other Modes
@@ -166,4 +166,4 @@ echo $? `date +%s000` >/home/user_name/nni/experiments/$experiment_id$/trials/$t
* [Find the best optimizer for CIFAR-10 classification](Cifar10Examples.md)
* [How to tune SciKit-learn parameters with NNI](SklearnExamples.md)
* [Use automatic model architecture search for reading comprehension.](SquadEvolutionExamples.md)
* [How to tune GBDT with NNI](GbdtExample.md)
* [How to tune GBDT with NNI](GbdtExample.md)
4 changes: 2 additions & 2 deletions examples/tuners/ga_customer_tuner/customer_tuner.py
@@ -79,7 +79,7 @@ def __init__(self, optimize_mode, population_size = 32):
logger.debug('init population done.')
return

def generate_parameters(self, parameter_id):
def generate_parameters(self, parameter_id, **kwargs):
"""Returns a set of trial graph config, as a serializable object.
parameter_id : int
"""
@@ -109,7 +109,7 @@ def generate_parameters(self, parameter_id):
return temp


def receive_trial_result(self, parameter_id, parameters, value):
def receive_trial_result(self, parameter_id, parameters, value, **kwargs):
'''
Record an observation of the objective function
parameter_id : int
4 changes: 2 additions & 2 deletions examples/tuners/random_nas_tuner/random_nas_tuner.py
@@ -49,12 +49,12 @@ def update_search_space(self, search_space):
self.searchspace_json = search_space
self.random_state = np.random.RandomState()

def generate_parameters(self, parameter_id):
def generate_parameters(self, parameter_id, **kwargs):
'''generate
'''
return random_archi_generator(self.searchspace_json, self.random_state)

def receive_trial_result(self, parameter_id, parameters, value):
def receive_trial_result(self, parameter_id, parameters, value, **kwargs):
'''receive
'''
pass
Expand Up @@ -112,7 +112,7 @@ def init_population(self, population_size, graph_max_layer, graph_min_layer):
population.append(Individual(indiv_id=self.generate_new_id(), graph_cfg=graph_tmp, result=None))
return population

def generate_parameters(self, parameter_id):
def generate_parameters(self, parameter_id, **kwargs):
"""Returns a set of trial graph config, as a serializable object.
An example configuration:
```json
@@ -196,7 +196,7 @@ def generate_parameters(self, parameter_id):
logger.debug("trial {} ready".format(indiv.indiv_id))
return param_json

def receive_trial_result(self, parameter_id, parameters, value):
def receive_trial_result(self, parameter_id, parameters, value, **kwargs):
'''
Record an observation of the objective function
parameter_id : int
2 changes: 1 addition & 1 deletion src/nni_manager/common/utils.ts
@@ -375,7 +375,7 @@ function countFilesRecursively(directory: string, timeoutMilliSeconds?: number):
}

function validateFileName(fileName: string): boolean {
let pattern: string = '^[a-z0-9A-Z\.-_]+$';
let pattern: string = '^[a-z0-9A-Z\._-]+$';
const validateResult = fileName.match(pattern);
if(validateResult) {
return true;
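Side note (not part of the diff): in the old pattern, `.-_` inside the character class is a range from `.` (0x2E) to `_` (0x5F), which silently admits characters such as `/` and `:`; moving `-` to the end makes it a literal hyphen. A quick hedged check in Python, where character classes behave the same way:

```python
import re

old = re.compile(r'^[a-z0-9A-Z\.-_]+$')  # '.-_' is a range covering '/', ':', '<', ... up to '_'
new = re.compile(r'^[a-z0-9A-Z\._-]+$')  # trailing '-' is a literal hyphen

print(bool(old.match('bad/name')))  # True  -- unintended acceptance
print(bool(new.match('bad/name')))  # False -- rejected as expected
```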
@@ -58,6 +58,10 @@ export abstract class ClusterJobRestServer extends RestServer {
this.port = basePort + 1;
}

get apiRootUrl(): string {
return this.API_ROOT_URL;
}

public get clusterRestServerPort(): number {
if (this.port === undefined) {
throw new Error('PAI Rest server port is undefined');
@@ -87,7 +91,7 @@ export abstract class ClusterJobRestServer extends RestServer {
protected abstract handleTrialMetrics(jobId : string, trialMetrics : any[]) : void;

// tslint:disable: no-unsafe-any no-any
private createRestHandler() : Router {
protected createRestHandler() : Router {
const router: Router = Router();

router.use((req: Request, res: Response, next: any) => {
12 changes: 7 additions & 5 deletions src/nni_manager/training_service/local/localTrainingService.ts
@@ -355,7 +355,8 @@ class LocalTrainingService implements TrainingService {
this.log.info('Stopping local machine training service...');
this.stopping = true;
for (const stream of this.jobStreamMap.values()) {
stream.destroy();
stream.end(0)
stream.emit('end')
}
if (this.gpuScheduler !== undefined) {
await this.gpuScheduler.stop();
@@ -372,7 +373,9 @@
if (stream === undefined) {
throw new Error(`Could not find stream in trial ${trialJob.id}`);
}
stream.destroy();
//Refer https://github.com/Juul/tail-stream/issues/20
stream.end(0)
stream.emit('end')
this.jobStreamMap.delete(trialJob.id);
}
}
@@ -507,12 +510,12 @@
script.push(
`cmd /c ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
`$NOW_DATE = "$NOW_DATE" + "000"`,
`$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`,
`Write $LASTEXITCODE " " $NOW_DATE | Out-File ${path.join(workingDirectory, '.nni', 'state')} -NoNewline -encoding utf8`);
} else {
script.push(
`eval ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
`echo $? \`date +%s000\` >${path.join(workingDirectory, '.nni', 'state')}`);
`echo $? \`date +%s%3N\` >${path.join(workingDirectory, '.nni', 'state')}`);
}

return script;
@@ -567,7 +570,6 @@
buffer = remain;
}
});

this.jobStreamMap.set(trialJobDetail.id, stream);
}

8 changes: 4 additions & 4 deletions src/nni_manager/training_service/pai/paiData.ts
@@ -64,11 +64,11 @@ else
fi`;

export const PAI_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} \
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
&& cd $NNI_SYS_DIR && sh install_nni.sh \
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}' \
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' \
--nni_manager_version '{12}' --log_collection '{13}'`;
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' \
--pai_hdfs_output_dir '{9}' --pai_hdfs_host '{10}' --pai_user_name {11} --nni_hdfs_exp_dir '{12}' --webhdfs_path '/webhdfs/api/v1' \
--nni_manager_version '{13}' --log_collection '{14}'`;

export const PAI_OUTPUT_DIR_FORMAT: string =
`hdfs://{0}:9000/`;
38 changes: 38 additions & 0 deletions src/nni_manager/training_service/pai/paiJobRestServer.ts
@@ -19,17 +19,26 @@

'use strict';

import { Request, Response, Router } from 'express';
import { Inject } from 'typescript-ioc';
import * as component from '../../common/component';
import { ClusterJobRestServer } from '../common/clusterJobRestServer';
import { PAITrainingService } from './paiTrainingService';

export interface ParameterFileMeta {
readonly experimentId: string;
readonly trialId: string;
readonly filePath: string;
}

/**
* PAI Training service Rest server, provides rest API to support pai job metrics update
*
*/
@component.Singleton
export class PAIJobRestServer extends ClusterJobRestServer {
private parameterFileMetaList: ParameterFileMeta[] = [];

@Inject
private readonly paiTrainingService : PAITrainingService;

@@ -52,4 +61,33 @@ export class PAIJobRestServer extends ClusterJobRestServer {
});
}
}

protected createRestHandler(): Router {
const router: Router = super.createRestHandler();

router.post(`/parameter-file-meta`, (req: Request, res: Response) => {
try {
this.log.info(`POST /parameter-file-meta, body is ${JSON.stringify(req.body)}`);
this.parameterFileMetaList.push(req.body);
res.send();
} catch (err) {
this.log.error(`POST parameter-file-meta error: ${err}`);
res.status(500);
res.send(err.message);
}
});

router.get(`/parameter-file-meta`, (req: Request, res: Response) => {
try {
this.log.info(`GET /parameter-file-meta`);
res.send(this.parameterFileMetaList);
} catch (err) {
this.log.error(`GET parameter-file-meta error: ${err}`);
res.status(500);
res.send(err.message);
}
});

return router;
}
}
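For context (not part of the changeset), the two new routes could be exercised with a plain HTTP client once the rest server is running; the host, port, and API root below are placeholders, since the real values come from `clusterRestServerPort` and the server's `apiRootUrl`:

```python
import requests  # third-party HTTP client, used here purely for illustration

BASE = 'http://127.0.0.1:51189/api/v1/nni-pai'  # placeholder host/port/root

meta = {'experimentId': 'some-exp-id', 'trialId': 'some-trial-id', 'filePath': '/tmp/parameter.cfg'}
requests.post(BASE + '/parameter-file-meta', json=meta).raise_for_status()

# Returns the list of ParameterFileMeta records stored so far.
print(requests.get(BASE + '/parameter-file-meta').json())
```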