This repository has been archived by the owner on Sep 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* skeleton of dlts training service (#1844) * Hello, DLTS! * Revert version * Remove fs-extra * Add some default cluster config * schema * fix * Optional cluster (default to `.default`) Depends on DLWorkspace#837 * fix * fix * optimize gpu type * No more copy * Format * Code clean up * Issue fix * Add optional fields in config * Issue fix * Lint * Lint * Validate email, password and team * Doc * Doc fix * Set TMPDIR * Use metadata instead of gpu_capacity * Cancel paused DLTS job * workaround lint rules * pylint * doc Co-authored-by: QuanluZhang <[email protected]>
- Loading branch information
1 parent
03cea2b
commit 134368f
Showing
21 changed files
with
848 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
**Run an Experiment on DLTS** | ||
=== | ||
NNI supports running an experiment on [DLTS](https://github.com/microsoft/DLWorkspace.git); this is called dlts mode. Before you start using NNI dlts mode, you need an account with access to the DLTS dashboard. | ||
|
||
## Setup Environment | ||
|
||
Step 1. Choose a cluster from the DLTS dashboard, and ask the administrator for the cluster dashboard URL. | ||
|
||
![Choose Cluster](../../img/dlts-step1.png) | ||
|
||
Step 2. Prepare an NNI config YAML like the following: | ||
|
||
```yaml | ||
# Set this field to "dlts" | ||
trainingServicePlatform: dlts | ||
authorName: your_name | ||
experimentName: auto_mnist | ||
trialConcurrency: 2 | ||
maxExecDuration: 3h | ||
maxTrialNum: 100 | ||
searchSpacePath: search_space.json | ||
useAnnotation: false | ||
tuner: | ||
builtinTunerName: TPE | ||
classArgs: | ||
optimize_mode: maximize | ||
trial: | ||
command: python3 mnist.py | ||
codeDir: . | ||
gpuNum: 1 | ||
image: msranni/nni | ||
# Configuration to access DLTS | ||
dltsConfig: | ||
dashboard: # Ask administrator for the cluster dashboard URL | ||
``` | ||
Remember to fill in the cluster dashboard URL on the last line. | ||
Step 3. Open your working directory on the cluster, and paste the NNI config as well as the related code into a directory. | ||
![Copy Config](../../img/dlts-step3.png) | ||
Step 4. Submit an NNI manager job to the specified cluster. | ||
![Submit Job](../../img/dlts-step4.png) | ||
Step 5. Go to the Endpoints tab of the newly created job, and click the Port 40000 link to check the trials' information. | ||
![View NNI WebUI](../../img/dlts-step5.png) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
debug: true | ||
authorName: default | ||
experimentName: example_mnist | ||
trialConcurrency: 1 | ||
maxExecDuration: 1h | ||
maxTrialNum: 10 | ||
#choice: local, remote, pai, dlts | ||
trainingServicePlatform: dlts | ||
searchSpacePath: search_space.json | ||
#choice: true, false | ||
useAnnotation: false | ||
tuner: | ||
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner | ||
#SMAC (SMAC should be installed through nnictl) | ||
builtinTunerName: TPE | ||
classArgs: | ||
#choice: maximize, minimize | ||
optimize_mode: maximize | ||
trial: | ||
command: python3 mnist.py | ||
codeDir: . | ||
gpuNum: 1 | ||
#The docker image to run nni job on dlts | ||
image: msranni/nni:latest | ||
dltsConfig: | ||
dashboard: http://azure-eastus-p40-dev1-infra01.eastus.cloudapp.azure.com/ | ||
|
||
# The following fields are all optional; they can be retrieved from environment | ||
# variables when running inside a DLTS job container. | ||
|
||
# cluster: .default | ||
# team: platform | ||
# email: [email protected] | ||
# password: # Paste from DLTS dashboard |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
14 changes: 14 additions & 0 deletions
14
src/nni_manager/training_service/dlts/dltsClusterConfig.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// Licensed under the MIT license. | ||
|
||
/**
 * Settings that identify a DLTS cluster and the account used to access it.
 *
 * Only `gpuType` is optional at the type level; every other field must be
 * present on an object of this type.
 */
export interface DLTSClusterConfig {
    /** URL of the cluster dashboard. */
    dashboard: string;

    /** Cluster name (the sample configuration uses `.default`). */
    cluster: string;
    /** Team name; DLTSJobConfig uses it for both `team` and `vcName`. */
    team: string;

    /** Account e-mail; DLTSJobConfig uses it as the job's `userName`. */
    email: string;
    /** Account password (the sample configuration says to paste it from the DLTS dashboard). */
    password: string;

    /**
     * GPU type of the cluster. May be absent on a freshly built config;
     * DLTSJobConfig refuses to build a job while it is undefined.
     */
    gpuType?: string;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// Licensed under the MIT license. | ||
|
||
/**
 * Single-line shell command template for launching one trial.
 *
 * Positional placeholders, as they appear in the template:
 *   {0} NNI_SYS_DIR        {1} NNI_OUTPUT_DIR     {2} NNI_TRIAL_JOB_ID
 *   {3} NNI_EXP_ID         {4} NNI_TRIAL_SEQ_ID   {5} MULTI_PHASE
 *   {6} directory to `cd` into before starting the trial keeper
 *   {7} --trial_command    {8} --nnimanager_ip    {9} --nnimanager_port
 *   {10} --nni_manager_version                    {11} --log_collection
 *
 * The segments are joined with a single space, so the resulting string
 * contains no line breaks.
 * NOTE(review): the placeholders are presumably filled in by a string-format
 * helper in the training service; that code is not visible here.
 */
export const DLTS_TRIAL_COMMAND_FORMAT: string = [
    "export NNI_PLATFORM=dlts NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5}",
    "&& cd $NNI_SYS_DIR && sh install_nni.sh",
    "&& cd '{6}' && python3 -m nni_trial_tool.trial_keeper --trial_command '{7}'",
    "--nnimanager_ip '{8}' --nnimanager_port '{9}' --nni_manager_version '{10}' --log_collection '{11}'",
].join(' ');
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// Licensed under the MIT license. | ||
|
||
import { DLTSClusterConfig } from "./dltsClusterConfig"; | ||
|
||
/**
 * Immutable description of one DLTS job.
 *
 * Combines per-job values passed to the constructor (name, GPU count, image,
 * command, interactive ports) with identity fields taken from the cluster
 * configuration and a set of fixed defaults.
 * NOTE(review): presumably serialized and submitted to the DLTS API by the
 * training service; the submitting code is not visible here.
 */
export class DLTSJobConfig {
    // Identity fields, filled in from the cluster configuration by the constructor.
    public readonly team: string;
    public readonly userName: string;
    public readonly vcName: string;
    public readonly gpuType: string;

    // Fixed job settings shared by every job built through this class.
    public readonly jobType = 'training';
    public readonly jobtrainingtype = 'RegularJob';
    public readonly ssh = false;
    public readonly ipython = false;
    public readonly tensorboard = false;
    public readonly workPath = '';
    public readonly enableworkpath = true;
    public readonly dataPath = '';
    public readonly enabledatapath = false;
    public readonly jobPath = '';
    public readonly enablejobpath = true;
    public readonly mountpoints = [];
    // TMPDIR is pointed at a tmp directory under $HOME for the job.
    public readonly env = [{ name: 'TMPDIR', value: '$HOME/tmp' }];
    public readonly hostNetwork = false;
    public readonly useGPUTopology = false;
    public readonly isPrivileged = false;
    public readonly hostIPC = false;
    // Deliberately the string "False", not a boolean.
    public readonly preemptionAllowed = 'False';

    /**
     * @param clusterConfig cluster settings; `gpuType` must already be set
     * @param jobName name of the job
     * @param resourcegpu number of GPUs requested
     * @param image docker image to run
     * @param cmd command line executed by the job
     * @param interactivePorts ports listed as interactive for the job
     * @throws Error when `clusterConfig.gpuType` is undefined
     */
    public constructor(
        clusterConfig: DLTSClusterConfig,
        public readonly jobName: string,
        public readonly resourcegpu: number,
        public readonly image: string,
        public readonly cmd: string,
        public readonly interactivePorts: number[],
    ) {
        const fetchedGpuType = clusterConfig.gpuType;
        if (fetchedGpuType === undefined) {
            throw new Error('GPU type not fetched');
        }

        // The team name doubles as the virtual cluster (vc) name.
        const teamName = clusterConfig.team;
        this.team = teamName;
        this.vcName = teamName;
        this.gpuType = fetchedGpuType;
        this.userName = clusterConfig.email;
    }
}
77 changes: 77 additions & 0 deletions
77
src/nni_manager/training_service/dlts/dltsJobRestServer.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// Licensed under the MIT license. | ||
|
||
'use strict'; | ||
|
||
import { Request, Response, Router } from 'express'; | ||
import { Inject } from 'typescript-ioc'; | ||
import * as component from '../../common/component'; | ||
import { ClusterJobRestServer } from '../common/clusterJobRestServer'; | ||
import { DLTSTrainingService } from './dltsTrainingService'; | ||
|
||
/**
 * One record kept by DLTSJobRestServer for its `/parameter-file-meta`
 * endpoints. All fields are read-only strings.
 */
export interface ParameterFileMeta {
    /** Experiment identifier (presumably the experiment that owns the trial — confirm with the caller). */
    readonly experimentId: string;
    /** Trial identifier. */
    readonly trialId: string;
    /** Path of the parameter file (presumably on storage shared with the trial — confirm). */
    readonly filePath: string;
}
|
||
/** | ||
* DLTS Training service Rest server, provides rest API to support DLTS job metrics update | ||
* | ||
*/ | ||
@component.Singleton | ||
export class DLTSJobRestServer extends ClusterJobRestServer { | ||
private parameterFileMetaList: ParameterFileMeta[] = []; | ||
|
||
@Inject | ||
private readonly dltsTrainingService: DLTSTrainingService; | ||
|
||
/** | ||
* constructor to provide NNIRestServer's own rest property, e.g. port | ||
*/ | ||
constructor() { | ||
super(); | ||
this.dltsTrainingService = component.get(DLTSTrainingService); | ||
} | ||
|
||
// tslint:disable-next-line:no-any | ||
protected handleTrialMetrics(jobId: string, metrics: any[]): void { | ||
// Split metrics array into single metric, then emit | ||
// Warning: If not split metrics into single ones, the behavior will be UNKNOWN | ||
for (const singleMetric of metrics) { | ||
this.dltsTrainingService.MetricsEmitter.emit('metric', { | ||
id : jobId, | ||
data : singleMetric | ||
}); | ||
} | ||
} | ||
|
||
protected createRestHandler(): Router { | ||
const router: Router = super.createRestHandler(); | ||
|
||
router.post(`/parameter-file-meta`, (req: Request, res: Response) => { | ||
try { | ||
this.log.info(`POST /parameter-file-meta, body is ${JSON.stringify(req.body)}`); | ||
this.parameterFileMetaList.push(req.body); | ||
res.send(); | ||
} catch (err) { | ||
this.log.error(`POST parameter-file-meta error: ${err}`); | ||
res.status(500); | ||
res.send(err.message); | ||
} | ||
}); | ||
|
||
router.get(`/parameter-file-meta`, (req: Request, res: Response) => { | ||
try { | ||
this.log.info(`GET /parameter-file-meta`); | ||
res.send(this.parameterFileMetaList); | ||
} catch (err) { | ||
this.log.error(`GET parameter-file-meta error: ${err}`); | ||
res.status(500); | ||
res.send(err.message); | ||
} | ||
}); | ||
|
||
return router; | ||
} | ||
} |
Oops, something went wrong.