Skip to content

Commit

Permalink
add NO_MORE_TRIAL state in experiment (microsoft#389)
Browse files Browse the repository at this point in the history
  • Loading branch information
QuanluZhang authored Nov 25, 2018
1 parent c4d1aef commit e577baf
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 7 deletions.
2 changes: 1 addition & 1 deletion src/nni_manager/common/manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ interface TrialJobStatistics {
}

interface NNIManagerStatus {
status: 'INITIALIZED' | 'EXPERIMENT_RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE';
status: 'INITIALIZED' | 'EXPERIMENT_RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL';
errors: string[];
}

Expand Down
25 changes: 19 additions & 6 deletions src/nni_manager/core/nnimanager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,9 @@ class NNIManager implements Manager {
this.status.status = 'EXPERIMENT_RUNNING';

// TODO: update database record for resume event
this.run().catch(console.error);
this.run().catch((err: Error) => {
this.criticalError(err);
});
}

public getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
Expand Down Expand Up @@ -440,10 +442,16 @@ class NNIManager implements Manager {
}

// check maxTrialNum and maxExecDuration here
// NO_MORE_TRIAL is more like a subset of EXPERIMENT_RUNNING, because during EXPERIMENT_RUNNING the tuner
// might tell nnimanager that there are no more trials. In the NO_MORE_TRIAL state, the experiment is viewed
// as still running. DONE can be reached from either EXPERIMENT_RUNNING or NO_MORE_TRIAL.
assert(this.status.status === 'EXPERIMENT_RUNNING' ||
this.status.status === 'DONE' ||
this.status.status === 'NO_MORE_TRIAL');
if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
assert(this.status.status === 'EXPERIMENT_RUNNING' || this.status.status === 'DONE');
if (this.status.status === 'EXPERIMENT_RUNNING') {
if (this.status.status === 'EXPERIMENT_RUNNING' ||
this.status.status === 'NO_MORE_TRIAL') {
this.experimentProfile.endTime = Date.now();
await this.storeExperimentProfile();
}
Expand All @@ -453,7 +461,9 @@ class NNIManager implements Manager {
delete this.experimentProfile.endTime;
await this.storeExperimentProfile();
}
this.status.status = 'EXPERIMENT_RUNNING';
if (this.status.status !== 'NO_MORE_TRIAL') {
this.status.status = 'EXPERIMENT_RUNNING';
}
for (let i: number = this.trialJobs.size; i < this.experimentProfile.params.trialConcurrency; i++) {
if (this.waitingTrials.length === 0 ||
this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
Expand Down Expand Up @@ -572,6 +582,10 @@ class NNIManager implements Manager {
this.requestTrialJobs(this.experimentProfile.params.trialConcurrency);
break;
case NEW_TRIAL_JOB:
if (this.status.status === 'NO_MORE_TRIAL') {
this.log.warning('It is not supposed to receive more trials after NO_MORE_TRIAL is set');
this.status.status = 'EXPERIMENT_RUNNING';
}
this.waitingTrials.push(content);
break;
case SEND_TRIAL_JOB_PARAMETER:
Expand All @@ -591,8 +605,7 @@ class NNIManager implements Manager {
'ADD_HYPERPARAMETER', tunerCommand.trial_job_id, content, undefined);
break;
case NO_MORE_TRIAL_JOBS:
//this.trialJobsMaintainer.setNoMoreTrials();
// ignore this event for now
this.status.status = 'NO_MORE_TRIAL';
break;
case KILL_TRIAL_JOB:
await this.trainingService.cancelTrialJob(JSON.parse(content), true);
Expand Down

0 comments on commit e577baf

Please sign in to comment.