Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge master #74

Merged
merged 4 commits into from
Nov 23, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/nni_manager/common/manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ interface ExperimentParams {
searchSpace: string;
trainingServicePlatform: string;
multiPhase?: boolean;
multiThread?: boolean;
tuner: {
className: string;
builtinTunerName?: string;
Expand Down
6 changes: 5 additions & 1 deletion src/nni_manager/common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -158,12 +158,16 @@ function parseArg(names: string[]): string {
* @param assessor: similiar as tuner
*
*/
function getMsgDispatcherCommand(tuner: any, assessor: any, multiPhase: boolean = false): string {
function getMsgDispatcherCommand(tuner: any, assessor: any, multiPhase: boolean = false, multiThread: boolean = false): string {
let command: string = `python3 -m nni --tuner_class_name ${tuner.className}`;
if (multiPhase) {
command += ' --multi_phase';
}

if (multiThread) {
command += ' --multi_thread';
}

if (tuner.classArgs !== undefined) {
command += ` --tuner_args ${JSON.stringify(JSON.stringify(tuner.classArgs))}`;
}
Expand Down
3 changes: 3 additions & 0 deletions src/nni_manager/core/commands.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ const ADD_CUSTOMIZED_TRIAL_JOB = 'AD';
const TRIAL_END = 'EN';
const TERMINATE = 'TE';

const INITIALIZED = 'ID';
const NEW_TRIAL_JOB = 'TR';
const SEND_TRIAL_JOB_PARAMETER = 'SP';
const NO_MORE_TRIAL_JOBS = 'NO';
Expand All @@ -39,6 +40,7 @@ const TUNER_COMMANDS: Set<string> = new Set([
ADD_CUSTOMIZED_TRIAL_JOB,
TERMINATE,

INITIALIZED,
NEW_TRIAL_JOB,
SEND_TRIAL_JOB_PARAMETER,
NO_MORE_TRIAL_JOBS
Expand All @@ -61,6 +63,7 @@ export {
ADD_CUSTOMIZED_TRIAL_JOB,
TRIAL_END,
TERMINATE,
INITIALIZED,
NEW_TRIAL_JOB,
NO_MORE_TRIAL_JOBS,
KILL_TRIAL_JOB,
Expand Down
51 changes: 39 additions & 12 deletions src/nni_manager/core/nnimanager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ import {
} from '../common/trainingService';
import { delay, getLogDir, getMsgDispatcherCommand } from '../common/utils';
import {
ADD_CUSTOMIZED_TRIAL_JOB, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, REPORT_METRIC_DATA,
REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE
ADD_CUSTOMIZED_TRIAL_JOB, INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS,
REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE
} from './commands';
import { createDispatcherInterface, IpcInterface } from './ipcInterface';

Expand Down Expand Up @@ -127,7 +127,8 @@ class NNIManager implements Manager {
this.trainingService.setClusterMetadata('multiPhase', expParams.multiPhase.toString());
}

const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.multiPhase);
const dispatcherCommand: string = getMsgDispatcherCommand(
expParams.tuner, expParams.assessor, expParams.multiPhase, expParams.multiThread);
this.log.debug(`dispatcher command: ${dispatcherCommand}`);
this.setupTuner(
//expParams.tuner.tunerCommand,
Expand Down Expand Up @@ -159,7 +160,8 @@ class NNIManager implements Manager {
this.trainingService.setClusterMetadata('multiPhase', expParams.multiPhase.toString());
}

const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.multiPhase);
const dispatcherCommand: string = getMsgDispatcherCommand(
expParams.tuner, expParams.assessor, expParams.multiPhase, expParams.multiThread);
this.log.debug(`dispatcher command: ${dispatcherCommand}`);
this.setupTuner(
dispatcherCommand,
Expand Down Expand Up @@ -419,16 +421,20 @@ class NNIManager implements Manager {
} else {
this.trialConcurrencyChange = requestTrialNum;
}
for (let i: number = 0; i < requestTrialNum; i++) {

const requestCustomTrialNum: number = Math.min(requestTrialNum, this.customizedTrials.length);
for (let i: number = 0; i < requestCustomTrialNum; i++) {
// ask tuner for more trials
if (this.customizedTrials.length > 0) {
const hyperParams: string | undefined = this.customizedTrials.shift();
this.dispatcher.sendCommand(ADD_CUSTOMIZED_TRIAL_JOB, hyperParams);
} else {
this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, '1');
}
}

if (requestTrialNum - requestCustomTrialNum > 0) {
this.requestTrialJobs(requestTrialNum - requestCustomTrialNum);
}

// check maxtrialnum and maxduration here
if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
Expand Down Expand Up @@ -526,11 +532,9 @@ class NNIManager implements Manager {
if (this.dispatcher === undefined) {
throw new Error('Dispatcher error: tuner has not been setup');
}
// TO DO: we should send INITIALIZE command to tuner if user's tuner needs to run init method in tuner
this.log.debug(`Send tuner command: update search space: ${this.experimentProfile.params.searchSpace}`);
this.dispatcher.sendCommand(UPDATE_SEARCH_SPACE, this.experimentProfile.params.searchSpace);
this.log.debug(`Send tuner command: ${this.experimentProfile.params.trialConcurrency}`);
this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, String(this.experimentProfile.params.trialConcurrency));
this.log.debug(`Send tuner command: INITIALIZE: ${this.experimentProfile.params.searchSpace}`);
// Tuner need to be initialized with search space before generating any hyper parameters
this.dispatcher.sendCommand(INITIALIZE, this.experimentProfile.params.searchSpace);
}

private async onTrialJobMetrics(metric: TrialJobMetric): Promise<void> {
Expand All @@ -541,9 +545,32 @@ class NNIManager implements Manager {
this.dispatcher.sendCommand(REPORT_METRIC_DATA, metric.data);
}

private requestTrialJobs(jobNum: number): void {
if (jobNum < 1) {
return;
}
if (this.dispatcher === undefined) {
throw new Error('Dispatcher error: tuner has not been setup');
}
if (this.experimentProfile.params.multiThread) {
// Send multiple requests to ensure multiple hyper parameters are generated in non-blocking way.
// For a single REQUEST_TRIAL_JOBS request, hyper parameters are generated one by one
// sequentially.
for (let i: number = 0; i < jobNum; i++) {
this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, '1');
}
} else {
this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, String(jobNum));
}
}

private async onTunerCommand(commandType: string, content: string): Promise<void> {
this.log.info(`Command from tuner: ${commandType}, ${content}`);
switch (commandType) {
case INITIALIZED:
// Tuner is intialized, search space is set, request tuner to generate hyper parameters
this.requestTrialJobs(this.experimentProfile.params.trialConcurrency);
break;
case NEW_TRIAL_JOB:
this.waitingTrials.push(content);
break;
Expand Down
23 changes: 21 additions & 2 deletions src/nni_manager/rest_server/restValidationSchemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,26 @@ export namespace ValidationSchemas {
outputDir: joi.string(),
cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required()
gpuNum: joi.number().min(0),
command: joi.string().min(1),
worker: joi.object({
replicas: joi.number().min(1).required(),
image: joi.string().min(1),
outputDir: joi.string(),
cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required()
}),
ps: joi.object({
replicas: joi.number().min(1).required(),
image: joi.string().min(1),
outputDir: joi.string(),
cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required()
})
}),
pai_config: joi.object({
userName: joi.string().min(1).required(),
Expand Down Expand Up @@ -68,6 +86,7 @@ export namespace ValidationSchemas {
searchSpace: joi.string().required(),
maxExecDuration: joi.number().min(0).required(),
multiPhase: joi.boolean(),
multiThread: joi.boolean(),
tuner: joi.object({
builtinTunerName: joi.string().valid('TPE', 'Random', 'Anneal', 'Evolution', 'SMAC', 'BatchTuner', 'GridSearch'),
codeDir: joi.string(),
Expand Down
35 changes: 32 additions & 3 deletions src/nni_manager/training_service/kubeflow/kubeflowConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,44 @@ export class NFSConfig {
/**
* Trial job configuration for Kubeflow
*/
export class KubeflowTrialConfig extends TrialConfig {
export class KubeflowTrialConfigTemplate {
/** replication number of current role */
public readonly replicas: number;

/** CPU number */
public readonly cpuNum: number;

/** Memory */
public readonly memoryMB: number;

/** Docker image */
public readonly image: string;

/** Trail command */
public readonly command : string;

/** Required GPU number for trial job. The number should be in [0,100] */
public readonly gpuNum : number;

constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, image: string) {
super(command, codeDir, gpuNum);
constructor(replicas: number, command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string) {
this.replicas = replicas;
this.command = command;
this.gpuNum = gpuNum;
this.cpuNum = cpuNum;
this.memoryMB = memoryMB;
this.image = image;
}
}

export class KubeflowTrialConfig {
public readonly codeDir: string;
public readonly ps?: KubeflowTrialConfigTemplate;
public readonly worker: KubeflowTrialConfigTemplate;

constructor(codeDir: string, worker: KubeflowTrialConfigTemplate, ps?: KubeflowTrialConfigTemplate) {
this.codeDir = codeDir;
this.worker = worker;
this.ps = ps;
}
}
2 changes: 1 addition & 1 deletion src/nni_manager/training_service/kubeflow/kubeflowData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh # Check and install NNI pkg
python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR//trialkeeper_stderr
python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
`

export type KubeflowTFJobType = 'Created' | 'Running' | 'Failed' | 'Succeeded';
Loading