Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Add portList config in PAI trainingService #1467

Merged
merged 6 commits into from
Aug 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions docs/en_US/TrainingService/PaiMode.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,32 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod
* Optional key. Sets the shmMB configuration of OpenPAI, which is the shared memory for one task in the task role.
* authFile
* Optional key. Sets the auth file path for a private registry when using PAI mode; see the [OpenPAI FAQ](https://github.com/microsoft/pai/blob/2ea69b45faa018662bc164ed7733f6fdbb4c42b3/docs/faq.md#q-how-to-use-private-docker-registry-job-image-when-submitting-an-openpai-job). Prepare the authFile and simply provide the local path of this file; NNI will upload it to HDFS for you.
* portList
* Optional key. Sets the portList configuration of OpenPAI, which specifies a list of ports used in the container; see the [OpenPAI job specification](https://github.com/microsoft/pai/blob/b2324866d0280a2d22958717ea6025740f71b9f0/docs/job_tutorial.md#specification).
The config schema in NNI is shown below:
```
portList:
- label: test
beginAt: 8080
portNumber: 2
```
Suppose you want to launch TensorBoard in the mnist example using such a port. The first step is to write a wrapper script `launch_pai.sh` for `mnist.py`:

```bash
export TENSORBOARD_PORT=PAI_PORT_LIST_${PAI_CURRENT_TASK_ROLE_NAME}_0_tensorboard
tensorboard --logdir . --port ${!TENSORBOARD_PORT} &
python3 mnist.py
```
The portList section of the config file should then be filled in as follows:

```yaml
trial:
command: bash launch_pai.sh
portList:
- label: tensorboard
beginAt: 0
portNumber: 1
```

Once you have filled in the NNI experiment config file and saved it (for example, as exp_pai.yml), run the following command:
```
Expand Down
2 changes: 1 addition & 1 deletion docs/en_US/TrainingService/SupportTrainingService.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,4 @@ abstract class TrainingService {
}
```
The parent class TrainingService has a few abstract functions; users need to inherit from it and implement all of these abstract functions.
For more information about how to write your own TrainingService, please [refer](https://github.com/SparkSnail/nni/blob/dev-trainingServiceDoc/docs/en_US/TrainingService/HowToImplementTrainingService.md).
For more information about how to write your own TrainingService, please refer to [How to implement a TrainingService](https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/HowToImplementTrainingService.md).
5 changes: 5 additions & 0 deletions src/nni_manager/rest_server/restValidationSchemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ export namespace ValidationSchemas {
shmMB: joi.number(),
authFile: joi.string(),
nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'),
portList: joi.array().items(joi.object({
label: joi.string().required(),
beginAt: joi.number().required(),
portNumber: joi.number().required(),
})),
worker: joi.object({
replicas: joi.number().min(1).required(),
image: joi.string().min(1),
Expand Down
20 changes: 18 additions & 2 deletions src/nni_manager/training_service/pai/paiConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ export class PAITaskRole {
public readonly command: string;
//Shared memory for one task in the task role
public readonly shmMB?: number;
//portList to specify the port used in container
public portList?: portListMetaData[];

/**
* Constructor
Expand All @@ -50,14 +52,15 @@ export class PAITaskRole {
* @param command Executable command for tasks in the task role, can not be empty
*/
constructor(name : string, taskNumber : number, cpuNumber : number, memoryMB : number, gpuNumber : number,
command : string, shmMB?: number) {
command : string, shmMB?: number, portList?: portListMetaData[]) {
this.name = name;
this.taskNumber = taskNumber;
this.cpuNumber = cpuNumber;
this.memoryMB = memoryMB;
this.gpuNumber = gpuNumber;
this.command = command;
this.shmMB = shmMB;
this.portList = portList;
}
}

Expand Down Expand Up @@ -120,6 +123,16 @@ export class PAIClusterConfig {
}
}

/**
 * One entry of the `portList` setting carried by a PAI task role and by the
 * NNI PAI trial configuration.
 *
 * A freshly constructed instance holds neutral defaults (empty label, zeros);
 * all fields are read-only after construction.
 */
export class portListMetaData {
    // Name identifying this port entry.
    public readonly label: string;
    // NOTE(review): presumably the first port number of the range — confirm against the OpenPAI job spec.
    public readonly beginAt: number;
    // NOTE(review): presumably how many ports are requested for this label — confirm against the OpenPAI job spec.
    public readonly portNumber: number;

    constructor() {
        this.label = '';
        this.beginAt = 0;
        this.portNumber = 0;
    }
}


/**
* PAI trial configuration
*/
Expand All @@ -134,15 +147,18 @@ export class NNIPAITrialConfig extends TrialConfig {
public shmMB?: number;
//authentication file used for private Docker registry
public authFile?: string;
//portList to specify the port used in container
public portList?: portListMetaData[];

constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number,
image: string, virtualCluster?: string, shmMB?: number, authFile?: string) {
image: string, virtualCluster?: string, shmMB?: number, authFile?: string, portList?: portListMetaData[]) {
super(command, codeDir, gpuNum);
this.cpuNum = cpuNum;
this.memoryMB = memoryMB;
this.image = image;
this.virtualCluster = virtualCluster;
this.shmMB = shmMB;
this.authFile = authFile;
this.portList = portList;
}
}
3 changes: 3 additions & 0 deletions src/nni_manager/training_service/pai/paiTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class PAITrainingService implements TrainingService {
private logCollection: string;
private isMultiPhase: boolean = false;
private authFileHdfsPath: string | undefined = undefined;
private portList?: string | undefined;

constructor() {
this.log = getLogger();
Expand Down Expand Up @@ -446,6 +447,8 @@ class PAITrainingService implements TrainingService {
nniPaiTrialCommand,
// Task shared memory
this.paiTrialConfig.shmMB,
// Task portList
this.paiTrialConfig.portList
)
];

Expand Down
7 changes: 6 additions & 1 deletion tools/nni_cmd/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,12 @@ def setPathCheck(key):
Optional('outputDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\
error='ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'),
Optional('virtualCluster'): setType('virtualCluster', str),
Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode')
Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'),
Optional('portList'): [{
"label": setType('label', str),
"beginAt": setType('beginAt', int),
"portNumber": setType('portNumber', int)
}]
}
}

Expand Down