diff --git a/docs/en_US/TrainingService/PaiMode.md b/docs/en_US/TrainingService/PaiMode.md index 78e7aa7984..3787f7165d 100644 --- a/docs/en_US/TrainingService/PaiMode.md +++ b/docs/en_US/TrainingService/PaiMode.md @@ -55,6 +55,32 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod * Optional key. Set the shmMB configuration of OpenPAI, it set the shared memory for one task in the task role. * authFile * Optional key, Set the auth file path for private registry while using PAI mode, [Refer](https://github.com/microsoft/pai/blob/2ea69b45faa018662bc164ed7733f6fdbb4c42b3/docs/faq.md#q-how-to-use-private-docker-registry-job-image-when-submitting-an-openpai-job), you can prepare the authFile and simply provide the local path of this file, NNI will upload this file to HDFS for you. +* portList + * Optional key. Sets the portList configuration of OpenPAI, which specifies a list of ports used in the container; see the [OpenPAI job specification](https://github.com/microsoft/pai/blob/b2324866d0280a2d22958717ea6025740f71b9f0/docs/job_tutorial.md#specification). + The config schema in NNI is shown below: + ``` + portList: + - label: test + beginAt: 8080 + portNumber: 2 + ``` + Suppose you want to launch TensorBoard in the mnist example using such a port. The first step is to write a wrapper script `launch_pai.sh` for `mnist.py`. + + ```bash + export TENSORBOARD_PORT=PAI_PORT_LIST_${PAI_CURRENT_TASK_ROLE_NAME}_0_tensorboard + tensorboard --logdir . 
--port ${!TENSORBOARD_PORT} & + python3 mnist.py + ``` + The portList section of the config file should be filled in as follows: + + ```yaml + trial: + command: bash launch_pai.sh + portList: + - label: tensorboard + beginAt: 0 + portNumber: 1 + ``` Once complete to fill NNI experiment config file and save (for example, save as exp_pai.yml), then run the following command ``` diff --git a/docs/en_US/TrainingService/SupportTrainingService.md b/docs/en_US/TrainingService/SupportTrainingService.md index 50c91173e2..dfb0df3fe8 100644 --- a/docs/en_US/TrainingService/SupportTrainingService.md +++ b/docs/en_US/TrainingService/SupportTrainingService.md @@ -33,4 +33,4 @@ abstract class TrainingService { } ``` The parent class of TrainingService has a few abstract functions, users need to inherit the parent class and implement all of these abstract functions. -For more information about how to write your own TrainingService, please [refer](https://github.com/SparkSnail/nni/blob/dev-trainingServiceDoc/docs/en_US/TrainingService/HowToImplementTrainingService.md). +For more information about how to write your own TrainingService, please [refer](https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/HowToImplementTrainingService.md). 
diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index 19f88f11af..1dd688816a 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -53,6 +53,11 @@ export namespace ValidationSchemas { shmMB: joi.number(), authFile: joi.string(), nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), + portList: joi.array().items(joi.object({ + label: joi.string().required(), + beginAt: joi.number().required(), + portNumber: joi.number().required(), + })), worker: joi.object({ replicas: joi.number().min(1).required(), image: joi.string().min(1), diff --git a/src/nni_manager/training_service/pai/paiConfig.ts b/src/nni_manager/training_service/pai/paiConfig.ts index 1248368a89..43f95f7f9c 100644 --- a/src/nni_manager/training_service/pai/paiConfig.ts +++ b/src/nni_manager/training_service/pai/paiConfig.ts @@ -39,6 +39,8 @@ export class PAITaskRole { public readonly command: string; //Shared memory for one task in the task role public readonly shmMB?: number; + //portList to specify the port used in container + public portList?: portListMetaData[]; /** * Constructor @@ -50,7 +52,7 @@ export class PAITaskRole { * @param command Executable command for tasks in the task role, can not be empty */ constructor(name : string, taskNumber : number, cpuNumber : number, memoryMB : number, gpuNumber : number, - command : string, shmMB?: number) { + command : string, shmMB?: number, portList?: portListMetaData[]) { this.name = name; this.taskNumber = taskNumber; this.cpuNumber = cpuNumber; @@ -58,6 +60,7 @@ export class PAITaskRole { this.gpuNumber = gpuNumber; this.command = command; this.shmMB = shmMB; + this.portList = portList; } } @@ -120,6 +123,16 @@ export class PAIClusterConfig { } } +/** + * portList data structure used in PAI taskRole + */ +export class portListMetaData { + public readonly label : string = ''; + 
public readonly beginAt: number = 0; + public readonly portNumber: number = 0; +} + + /** * PAI trial configuration */ @@ -134,9 +147,11 @@ export class NNIPAITrialConfig extends TrialConfig { public shmMB?: number; //authentication file used for private Docker registry public authFile?: string; + //portList to specify the port used in container + public portList?: portListMetaData[]; constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, - image: string, virtualCluster?: string, shmMB?: number, authFile?: string) { + image: string, virtualCluster?: string, shmMB?: number, authFile?: string, portList?: portListMetaData[]) { super(command, codeDir, gpuNum); this.cpuNum = cpuNum; this.memoryMB = memoryMB; @@ -144,5 +159,6 @@ export class NNIPAITrialConfig extends TrialConfig { this.virtualCluster = virtualCluster; this.shmMB = shmMB; this.authFile = authFile; + this.portList = portList; } } diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 91865d906f..ff742f0fc0 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -79,6 +79,7 @@ class PAITrainingService implements TrainingService { private logCollection: string; private isMultiPhase: boolean = false; private authFileHdfsPath: string | undefined = undefined; + private portList?: string | undefined; constructor() { this.log = getLogger(); @@ -446,6 +447,8 @@ class PAITrainingService implements TrainingService { nniPaiTrialCommand, // Task shared memory this.paiTrialConfig.shmMB, + // Task portList + this.paiTrialConfig.portList ) ]; diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index f09786664b..ad13b9684e 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -240,7 +240,12 @@ def setPathCheck(key): Optional('outputDir'): 
And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\ error='ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'), Optional('virtualCluster'): setType('virtualCluster', str), - Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode') + Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), + Optional('portList'): [{ + "label": setType('label', str), + "beginAt": setType('beginAt', int), + "portNumber": setType('portNumber', int) + }] } }