DLTS integration (#1945)

* skeleton of dlts training service (#1844) * Hello, DLTS! * Revert version * Remove fs-extra * Add some default cluster config * schema * fix * Optional cluster (default to `.default`) Depends on DLWorkspace#837 * fix * fix * optimize gpu type * No more copy * Format * Code clean up * Issue fix * Add optional fields in config * Issue fix * Lint * Lint * Validate email, password and team * Doc * Doc fix * Set TMPDIR * Use metadata instead of gpu_capacity * Cancel paused DLTS job * workaround lint rules * pylint * doc Co-authored-by: QuanluZhang <[email protected]>
microsoft · Mar 2, 2020 · 134368f · 134368f
1 parent 03cea2b
commit 134368f
Show file tree

Hide file tree

Showing 21 changed files with 848 additions and 7 deletions.
diff --git a/docs/en_US/TrainingService/DLTSMode.md b/docs/en_US/TrainingService/DLTSMode.md
@@ -0,0 +1,49 @@
+**Run an Experiment on DLTS**
+===
+NNI supports running an experiment on [DLTS](https://github.com/microsoft/DLWorkspace.git); this is called dlts mode. Before you start using NNI dlts mode, you need an account that can access the DLTS dashboard.
+
+## Setup Environment
+
+Step 1. Choose a cluster from the DLTS dashboard, and ask your administrator for the cluster dashboard URL.
+
+![Choose Cluster](../../img/dlts-step1.png)
+
+Step 2. Prepare an NNI config YAML like the following:
+
+```yaml
+# Set this field to "dlts"
+trainingServicePlatform: dlts
+authorName: your_name
+experimentName: auto_mnist
+trialConcurrency: 2
+maxExecDuration: 3h
+maxTrialNum: 100
+searchSpacePath: search_space.json
+useAnnotation: false
+tuner:
+  builtinTunerName: TPE
+  classArgs:
+    optimize_mode: maximize
+trial:
+  command: python3 mnist.py
+  codeDir: .
+  gpuNum: 1
+  image: msranni/nni
+# Configuration to access DLTS
+dltsConfig:
+  dashboard: # Ask administrator for the cluster dashboard URL
+```
+
+Remember to fill in the cluster dashboard URL on the last line.
+
+Step 3. Open your working directory on the cluster, and paste the NNI config as well as the related code into a directory.
+
+![Copy Config](../../img/dlts-step3.png)
+
+Step 4. Submit an NNI manager job to the specified cluster.
+
+![Submit Job](../../img/dlts-step4.png)
+
+Step 5. Go to the Endpoints tab of the newly created job, and click the Port 40000 link to check the trials' information.
+
+![View NNI WebUI](../../img/dlts-step5.png)
diff --git a/docs/en_US/training_services.rst b/docs/en_US/training_services.rst
@@ -9,3 +9,4 @@ Introduction to NNI Training Services
     OpenPAI Yarn Mode<./TrainingService/PaiYarnMode>
     Kubeflow<./TrainingService/KubeflowMode>
     FrameworkController<./TrainingService/FrameworkControllerMode>
+    DLTS<./TrainingService/DLTSMode>
diff --git a/docs/img/dlts-step1.png b/docs/img/dlts-step1.png
diff --git a/docs/img/dlts-step3.png b/docs/img/dlts-step3.png
diff --git a/docs/img/dlts-step4.png b/docs/img/dlts-step4.png
diff --git a/docs/img/dlts-step5.png b/docs/img/dlts-step5.png
diff --git a/examples/trials/mnist-tfv1/config_dlts.yml b/examples/trials/mnist-tfv1/config_dlts.yml
@@ -0,0 +1,34 @@
+debug: true
+authorName: default
+experimentName: example_mnist
+trialConcurrency: 1
+maxExecDuration: 1h
+maxTrialNum: 10
+#choice: local, remote, pai, kubeflow, frameworkcontroller, paiYarn, dlts
+trainingServicePlatform: dlts
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  command: python3 mnist.py
+  codeDir: .
+  gpuNum: 1
+  #The docker image to run nni job on dlts
+  image: msranni/nni:latest
+dltsConfig:
+  dashboard: http://azure-eastus-p40-dev1-infra01.eastus.cloudapp.azure.com/
+
+  # The following fields are all optional and could be retrieved from environment
+  # variables if running in DLTS job container.
+
+  # cluster: .default
+  # team: platform
+  # email: [email protected]
+  # password: # Paste from DLTS dashboard
diff --git a/src/nni_manager/main.ts b/src/nni_manager/main.ts
@@ -26,6 +26,7 @@ import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTr
 import {
     RemoteMachineTrainingService
 } from './training_service/remote_machine/remoteMachineTrainingService';
+import { DLTSTrainingService } from './training_service/dlts/dltsTrainingService';
 
 function initStartupInfo(
     startExpMode: string, resumeExperimentId: string, basePort: number,
@@ -60,6 +61,10 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
         Container.bind(TrainingService)
             .to(FrameworkControllerTrainingService)
             .scope(Scope.Singleton);
+    } else if (platformMode === 'dlts') {
+        Container.bind(TrainingService)
+            .to(DLTSTrainingService)
+            .scope(Scope.Singleton);
     } else {
         throw new Error(`Error: unsupported mode: ${platformMode}`);
     }
@@ -108,7 +113,7 @@ const foreground: boolean = foregroundArg.toLowerCase() === 'true' ? true : fals
 const port: number = parseInt(strPort, 10);
 
 const mode: string = parseArg(['--mode', '-m']);
-if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn'].includes(mode)) {
+if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts'].includes(mode)) {
     console.log(`FATAL: unknown mode: ${mode}`);
     usage();
     process.exit(1);

diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts
@@ -140,6 +140,15 @@ export namespace ValidationSchemas {
                 }),
                 uploadRetryCount: joi.number().min(1)
             }),
+            dlts_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
+                dashboard: joi.string().min(1),
+
+                cluster: joi.string().min(1),
+                team: joi.string().min(1),
+
+                email: joi.string().min(1),
+                password: joi.string().min(1)
+            }),
             nni_manager_ip: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
                 nniManagerIp: joi.string().min(1)
             })

diff --git a/src/nni_manager/training_service/common/trialConfigMetadataKey.ts b/src/nni_manager/training_service/common/trialConfigMetadataKey.ts
@@ -18,6 +18,7 @@ export enum TrialConfigMetadataKey {
     KUBEFLOW_CLUSTER_CONFIG = 'kubeflow_config',
     NNI_MANAGER_IP = 'nni_manager_ip',
     FRAMEWORKCONTROLLER_CLUSTER_CONFIG = 'frameworkcontroller_config',
+    DLTS_CLUSTER_CONFIG = 'dlts_config',
     VERSION_CHECK = 'version_check',
     LOG_COLLECTION = 'log_collection'
 }
diff --git a/src/nni_manager/training_service/dlts/dltsClusterConfig.ts b/src/nni_manager/training_service/dlts/dltsClusterConfig.ts
@@ -0,0 +1,14 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+export interface DLTSClusterConfig { // DLTS cluster location and account settings
+  dashboard: string; // cluster dashboard URL
+
+  cluster: string; // NOTE(review): the example config shows `.default` here — confirm that is the intended default
+  team: string; // DLTSJobConfig copies this into both `team` and `vcName`
+
+  email: string; // DLTSJobConfig uses this as the job's `userName`
+  password: string; // NOTE(review): the example config says to paste this from the DLTS dashboard — confirm
+
+  gpuType?: string; // optional here, but DLTSJobConfig throws if it is still undefined
+}
diff --git a/src/nni_manager/training_service/dlts/dltsData.ts b/src/nni_manager/training_service/dlts/dltsData.ts
@@ -0,0 +1,8 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+export const DLTS_TRIAL_COMMAND_FORMAT: string = // Shell command template for one trial; {0}..{11} are positional placeholders (presumably filled by a String.format-style helper — confirm at the call site)
+`export NNI_PLATFORM=dlts NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
+&& cd $NNI_SYS_DIR && sh install_nni.sh \
+&& cd '{6}' && python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' \
+--nnimanager_ip '{8}' --nnimanager_port '{9}' --nni_manager_version '{10}' --log_collection '{11}'`;
diff --git a/src/nni_manager/training_service/dlts/dltsJobConfig.ts b/src/nni_manager/training_service/dlts/dltsJobConfig.ts
@@ -0,0 +1,45 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+import { DLTSClusterConfig } from "./dltsClusterConfig";
+
+export class DLTSJobConfig { // Job description combining fixed defaults, cluster config values and per-job constructor arguments; NOTE(review): presumably the DLTS job submission payload — confirm
+  public readonly team: string;
+  public readonly userName: string;
+  public readonly vcName: string;
+  public readonly gpuType: string;
+  public readonly jobType = "training";
+  public readonly jobtrainingtype = "RegularJob";
+  public readonly ssh = false;
+  public readonly ipython = false;
+  public readonly tensorboard = false;
+  public readonly workPath = '';
+  public readonly enableworkpath = true;
+  public readonly dataPath = '';
+  public readonly enabledatapath = false;
+  public readonly jobPath = '';
+  public readonly enablejobpath = true;
+  public readonly mountpoints = [];
+  public readonly env = [{ name: 'TMPDIR', value: '$HOME/tmp' }] // every job gets TMPDIR set to $HOME/tmp
+  public readonly hostNetwork = false;
+  public readonly useGPUTopology = false;
+  public readonly isPrivileged = false;
+  public readonly hostIPC = false;
+  public readonly preemptionAllowed = "False" // NOTE(review): the string "False", not a boolean — presumably what the DLTS API expects; confirm
+
+  public constructor( // the parameters declared `public readonly` below become fields of the same name
+    clusterConfig: DLTSClusterConfig,
+    public readonly jobName: string,
+    public readonly resourcegpu: number,
+    public readonly image: string,
+    public readonly cmd: string,
+    public readonly interactivePorts: number[],
+  ) {
+    if (clusterConfig.gpuType === undefined) { // gpuType is optional on DLTSClusterConfig, so reject a config where it was never filled in
+      throw Error('GPU type not fetched')
+    }
+    this.vcName = this.team = clusterConfig.team // vcName and team both take the configured team
+    this.gpuType = clusterConfig.gpuType
+    this.userName = clusterConfig.email // the configured email is used as the job's user name
+  }
+}
diff --git a/src/nni_manager/training_service/dlts/dltsJobRestServer.ts b/src/nni_manager/training_service/dlts/dltsJobRestServer.ts
@@ -0,0 +1,77 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+'use strict';
+
+import { Request, Response, Router } from 'express';
+import { Inject } from 'typescript-ioc';
+import * as component from '../../common/component';
+import { ClusterJobRestServer } from '../common/clusterJobRestServer';
+import { DLTSTrainingService } from './dltsTrainingService';
+
+export interface ParameterFileMeta { // Element type of DLTSJobRestServer.parameterFileMetaList (filled from POST /parameter-file-meta bodies)
+    readonly experimentId: string;
+    readonly trialId: string;
+    readonly filePath: string; // NOTE(review): presumably the location of the trial's parameter file — confirm
+}
+
+/**
+ * REST server for the DLTS training service: forwards trial metrics to DLTSTrainingService
+ * and adds POST/GET `/parameter-file-meta` routes backed by an in-memory list.
+ */
+@component.Singleton
+export class DLTSJobRestServer extends ClusterJobRestServer {
+    private parameterFileMetaList: ParameterFileMeta[] = []; // in-memory only; appended to by POST /parameter-file-meta
+
+    @Inject
+    private readonly dltsTrainingService: DLTSTrainingService;
+
+    /**
+     * Calls the base constructor, then looks up the DLTSTrainingService via component.get.
+     */
+    constructor() {
+        super();
+        this.dltsTrainingService = component.get(DLTSTrainingService); // NOTE(review): the field is also marked @Inject, so this assignment looks redundant — confirm
+    }
+
+    // tslint:disable-next-line:no-any
+    protected handleTrialMetrics(jobId: string, metrics: any[]): void {
+        // Emit one 'metric' event per array element on the training service's MetricsEmitter
+        // Warning: if the array is not split into single metrics, the behavior will be UNKNOWN
+        for (const singleMetric of metrics) {
+            this.dltsTrainingService.MetricsEmitter.emit('metric', {
+                id : jobId,
+                data : singleMetric
+            });
+        }
+    }
+
+    protected createRestHandler(): Router { // extends the base class's router with the /parameter-file-meta routes
+        const router: Router = super.createRestHandler();
+
+        router.post(`/parameter-file-meta`, (req: Request, res: Response) => { // stores the request body and replies with an empty response
+            try {
+                this.log.info(`POST /parameter-file-meta, body is ${JSON.stringify(req.body)}`);
+                this.parameterFileMetaList.push(req.body); // NOTE(review): the body is stored without being checked against ParameterFileMeta
+                res.send();
+            } catch (err) {
+                this.log.error(`POST parameter-file-meta error: ${err}`);
+                res.status(500);
+                res.send(err.message);
+            }
+        });
+
+        router.get(`/parameter-file-meta`, (req: Request, res: Response) => { // replies with every entry stored so far
+            try {
+                this.log.info(`GET /parameter-file-meta`);
+                res.send(this.parameterFileMetaList);
+            } catch (err) {
+                this.log.error(`GET parameter-file-meta error: ${err}`);
+                res.status(500);
+                res.send(err.message);
+            }
+        });
+
+        return router;
+    }
+}