diff --git a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index 7b17c5cc4d..1061ad9ba2 100644 --- a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -83,8 +83,8 @@ export class AMLEnvironmentService extends EnvironmentService { public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise { environments.forEach(async (environment) => { const amlClient = (environment as AMLEnvironmentInformation).amlClient; - if (!amlClient) { - throw new Error('AML client not initialized!'); + if (!amlClient) { + return Promise.reject('AML client not initialized!'); } const status = await amlClient.updateStatus(environment.status); switch (status.toUpperCase()) { @@ -99,7 +99,7 @@ export class AMLEnvironmentService extends EnvironmentService { break; case 'FAILED': environment.setFinalStatus('FAILED'); - break; + return Promise.reject(`AML: job ${environment.jobId} is failed!`); case 'STOPPED': case 'STOPPING': environment.setFinalStatus('USER_CANCELED'); diff --git a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts index 0d935a6a37..1a527b26f9 100644 --- a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts @@ -121,8 +121,11 @@ export class OpenPaiEnvironmentService extends EnvironmentService { // RUNNING status is set by runner, and ignore waiting status break; case 'SUCCEEDED': + environment.setFinalStatus(jobResponse.state); + break; case 'FAILED': environment.setFinalStatus(jobResponse.state); + deferred.reject(`OpenPAI: job ${environment.jobId} is failed!`); break; case 
'STOPPED': case 'STOPPING': diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 156909e129..8d7df06686 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -526,19 +526,20 @@ class TrialDispatcher implements TrainingService { } private async handleStdout(commandData: any): Promise { + const metricPattern: RegExp = /NNISDK_MEb'(?<metrics>.*a?)'$/gm; const trialLogDir: string = path.join(getExperimentRootDir(), 'trials', commandData["trial"]); mkDirPSync(trialLogDir); const trialLogPath: string = path.join(trialLogDir, 'stdout_log_collection.log'); try { let skipLogging: boolean = false; if (commandData["tag"] === 'trial' && commandData["msg"] !== undefined) { - const message = commandData["msg"]; - const metricsContent: any = message.match(this.NNI_METRICS_PATTERN); - if (metricsContent && metricsContent.groups) { + const message: string = commandData["msg"]; + let metricsContent = metricPattern.exec(message); + while (metricsContent && metricsContent.groups) { const key: string = 'metrics'; const data = metricsContent.groups[key]; - const metricData = JSON.parse('"' + data.split('"').join('\\"') + '"'); - await this.handleMetricData(commandData["trial"], metricData); + await this.handleMetricData(commandData["trial"], data); + metricsContent = metricPattern.exec(message); skipLogging = true; } } diff --git a/tools/nni_trial_tool/base_channel.py b/tools/nni_trial_tool/base_channel.py index c1ce564ba8..b9d3392abc 100644 --- a/tools/nni_trial_tool/base_channel.py +++ b/tools/nni_trial_tool/base_channel.py @@ -57,7 +57,11 @@ def open(self): def close(self): self.is_running = False - self._inner_close() + try: + self._inner_close() + except Exception as err: + # ignore any error on closing + print("error on closing channel: %s" % err) def send(self, command, data): """Send command to Training 
Service. diff --git a/tools/nni_trial_tool/web_channel.py b/tools/nni_trial_tool/web_channel.py index 752a303cb0..8d3830f80a 100644 --- a/tools/nni_trial_tool/web_channel.py +++ b/tools/nni_trial_tool/web_channel.py @@ -33,9 +33,9 @@ def _inner_open(self): def _inner_close(self): if self.client is not None: self.client.close() - if self._event_loop.is_running(): - self._event_loop.close() self.client = None + if self._event_loop.is_running(): + self._event_loop.stop() self._event_loop = None def _inner_send(self, message):