Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
Support remote training service use reuse mode (#2923)
Browse files Browse the repository at this point in the history
  • Loading branch information
SparkSnail authored Oct 10, 2020
1 parent 1ae8a0d commit 392e55f
Show file tree
Hide file tree
Showing 12 changed files with 390 additions and 9 deletions.
10 changes: 10 additions & 0 deletions docs/en_US/Tutorial/ExperimentConfig.md
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,16 @@ Specifies the pre-command that will be executed before the remote machine execut

__Note__: Because __preCommand__ will execute before other commands each time, it is strongly not recommended to set __preCommand__ that will make changes to system, i.e. `mkdir` or `touch`.

### remoteConfig

Optional field in remote mode. Users could set per machine information in `machineList` field, and set global configuration for remote mode in this field.

#### reuse

Optional. Bool. default: `false`. It's an experimental feature.

If it's true, NNI will reuse remote jobs to run as many as possible trials. It can save time of creating new jobs. User needs to make sure each trial can run independent in same job, for example, avoid loading checkpoint from previous trials.

### kubeflowConfig

#### operator
Expand Down
5 changes: 1 addition & 4 deletions src/nni_manager/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,6 @@ import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/
import { LocalTrainingService } from './training_service/local/localTrainingService';
import { RouterTrainingService } from './training_service/reusable/routerTrainingService';
import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTrainingService';
import {
RemoteMachineTrainingService
} from './training_service/remote_machine/remoteMachineTrainingService';
import { DLTSTrainingService } from './training_service/dlts/dltsTrainingService';

function initStartupInfo(
Expand All @@ -43,7 +40,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
.scope(Scope.Singleton);
} else if (platformMode === 'remote') {
Container.bind(TrainingService)
.to(RemoteMachineTrainingService)
.to(RouterTrainingService)
.scope(Scope.Singleton);
} else if (platformMode === 'pai') {
Container.bind(TrainingService)
Expand Down
3 changes: 3 additions & 0 deletions src/nni_manager/rest_server/restValidationSchemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ export namespace ValidationSchemas {
}),
nni_manager_ip: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
nniManagerIp: joi.string().min(1)
}),
remote_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
reuse: joi.boolean()
})
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export enum TrialConfigMetadataKey {
MACHINE_LIST = 'machine_list',
LOCAL_CONFIG = 'local_config',
TRIAL_CONFIG = 'trial_config',
REMOTE_CONFIG = 'remote_config',
EXPERIMENT_ID = 'experimentId',
MULTI_PHASE = 'multiPhase',
RANDOM_SCHEDULER = 'random_scheduler',
Expand Down
8 changes: 7 additions & 1 deletion src/nni_manager/training_service/reusable/environment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,14 @@ export abstract class EnvironmentService {
public abstract get hasStorageService(): boolean;
public abstract config(key: string, value: string): Promise<void>;
public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void>;
public abstract startEnvironment(environment: EnvironmentInformation): Promise<void>;
public abstract stopEnvironment(environment: EnvironmentInformation): Promise<void>;
public abstract startEnvironment(environment: EnvironmentInformation): Promise<void>;

// It is used to set prefetched environment count, default value is 0 for OpenPAI and AML mode,
// in remote mode, this value is set to the length of machine list.
public get prefetchedEnvironmentCount(): number {
return 0;
}

// It depends on environment pressure and settings
// for example, OpenPAI relies on API calls, and there is an limitation for frequence, so it need to be bigger.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,301 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

'use strict';

import * as fs from 'fs';
import * as path from 'path';
import * as component from '../../../common/component';
import { getExperimentId } from '../../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../../common/log';
import { EnvironmentInformation, EnvironmentService } from '../environment';
import {
getExperimentRootDir,
} from '../../../common/utils';
import { TrialConfig } from '../../common/trialConfig';
import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import { execMkdir, validateCodeDir } from '../../common/util';
import {
ExecutorManager, RemoteMachineMeta,
} from '../../remote_machine/remoteMachineData';
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
import { RemoteMachineEnvironmentInformation } from '../remote/remoteConfig';


@component.Singleton
export class RemoteEnvironmentService extends EnvironmentService {

private readonly initExecutorId = "initConnection";
private readonly machineExecutorManagerMap: Map<RemoteMachineMeta, ExecutorManager>;
private readonly environmentExecutorManagerMap: Map<string, ExecutorManager>;
private readonly remoteMachineMetaOccupiedMap: Map<RemoteMachineMeta, boolean>;
private trialConfig: TrialConfig | undefined;
private readonly log: Logger;
private sshConnectionPromises: any[];
private experimentRootDir: string;
private experimentId: string;

constructor() {
super();
this.experimentId = getExperimentId();
this.environmentExecutorManagerMap = new Map<string, ExecutorManager>();
this.machineExecutorManagerMap = new Map<RemoteMachineMeta, ExecutorManager>();
this.remoteMachineMetaOccupiedMap = new Map<RemoteMachineMeta, boolean>();
this.sshConnectionPromises = [];
this.experimentRootDir = getExperimentRootDir();
this.experimentId = getExperimentId();
this.log = getLogger();
}

public get prefetchedEnvironmentCount(): number {
return this.machineExecutorManagerMap.size;
}

public get environmentMaintenceLoopInterval(): number {
return 5000;
}

public get hasMoreEnvironments(): boolean {
return false;
}

public get hasStorageService(): boolean {
return false;
}

public async config(key: string, value: string): Promise<void> {
switch (key) {
case TrialConfigMetadataKey.MACHINE_LIST:
await this.setupConnections(value);
break;
case TrialConfigMetadataKey.TRIAL_CONFIG: {
const remoteMachineTrailConfig: TrialConfig = <TrialConfig>JSON.parse(value);
// Parse trial config failed, throw Error
if (remoteMachineTrailConfig === undefined) {
throw new Error('trial config parsed failed');
}
// codeDir is not a valid directory, throw Error
if (!fs.lstatSync(remoteMachineTrailConfig.codeDir)
.isDirectory()) {
throw new Error(`codeDir ${remoteMachineTrailConfig.codeDir} is not a directory`);
}
try {
// Validate to make sure codeDir doesn't have too many files
await validateCodeDir(remoteMachineTrailConfig.codeDir);
} catch (error) {
this.log.error(error);
return Promise.reject(new Error(error));
}

this.trialConfig = remoteMachineTrailConfig;
break;
}
default:
this.log.debug(`Remote not support metadata key: '${key}', value: '${value}'`);
}
}

private scheduleMachine(): RemoteMachineMeta | undefined {
for (const [rmMeta, occupied] of this.remoteMachineMetaOccupiedMap) {
if (!occupied) {
this.remoteMachineMetaOccupiedMap.set(rmMeta, true);
return rmMeta;
}
}
return undefined;
}

private async setupConnections(machineList: string): Promise<void> {
this.log.debug(`Connecting to remote machines: ${machineList}`);
//TO DO: verify if value's format is wrong, and json parse failed, how to handle error
const rmMetaList: RemoteMachineMeta[] = <RemoteMachineMeta[]>JSON.parse(machineList);

for (const rmMeta of rmMetaList) {
this.sshConnectionPromises.push(await this.initRemoteMachineOnConnected(rmMeta));
}
}

private async initRemoteMachineOnConnected(rmMeta: RemoteMachineMeta): Promise<void> {
const executorManager: ExecutorManager = new ExecutorManager(rmMeta);
this.log.info(`connecting to ${rmMeta.username}@${rmMeta.ip}:${rmMeta.port}`);
const executor: ShellExecutor = await executorManager.getExecutor(this.initExecutorId);
this.log.debug(`reached ${executor.name}`);
this.machineExecutorManagerMap.set(rmMeta, executorManager);
this.log.debug(`initializing ${executor.name}`);

// Create root working directory after executor is ready
const nniRootDir: string = executor.joinPath(executor.getTempPath(), 'nni');
await executor.createFolder(executor.getRemoteExperimentRootDir(getExperimentId()));

// the directory to store temp scripts in remote machine
const remoteGpuScriptCollectorDir: string = executor.getRemoteScriptsPath(getExperimentId());

// clean up previous result.
await executor.createFolder(remoteGpuScriptCollectorDir, true);
await executor.allowPermission(true, nniRootDir);
}

private async refreshEnvironment(environment: EnvironmentInformation): Promise<void> {
const executor = await this.getExecutor(environment.id);
const jobpidPath: string = `${environment.runnerWorkingFolder}/pid`;
const runnerReturnCodeFilePath: string = `${environment.runnerWorkingFolder}/code`;
if (fs.existsSync(jobpidPath)) {
/* eslint-disable require-atomic-updates */
try {
const isAlive = await executor.isProcessAlive(jobpidPath);
// if the process of jobpid is not alive any more
if (!isAlive) {
const remoteEnvironment: RemoteMachineEnvironmentInformation = environment as RemoteMachineEnvironmentInformation;
if (remoteEnvironment.rmMachineMeta === undefined) {
throw new Error(`${remoteEnvironment.id} machine meta not initialized!`);
}
this.log.info(`pid in ${remoteEnvironment.rmMachineMeta.ip}:${jobpidPath} is not alive!`);
if (fs.existsSync(runnerReturnCodeFilePath)) {
const runnerReturnCode: string = await executor.getRemoteFileContent(runnerReturnCodeFilePath);
const match: RegExpMatchArray | null = runnerReturnCode.trim()
.match(/^-?(\d+)\s+(\d+)$/);
if (match !== null) {
const { 1: code } = match;
// Update trial job's status based on result code
if (parseInt(code, 10) === 0) {
environment.setStatus('SUCCEEDED');
} else {
environment.setStatus('FAILED');
}
this.releaseEnvironmentResource(environment);
}
}
}
} catch (error) {
this.releaseEnvironmentResource(environment);
this.log.error(`Update job status exception, error is ${error.message}`);
}
}
}

public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void> {
const tasks: Promise<void>[] = [];
environments.forEach(async (environment) => {
tasks.push(this.refreshEnvironment(environment));
});
await Promise.all(tasks);
}

/**
* If a environment is finished, release the connection resource
* @param environment remote machine environment job detail
*/
private releaseEnvironmentResource(environment: EnvironmentInformation): void {
const executorManager = this.environmentExecutorManagerMap.get(environment.id);
if (executorManager === undefined) {
throw new Error(`ExecutorManager is not assigned for environment ${environment.id}`);
}

// Note, it still keep reference in trialExecutorManagerMap, as there may be following requests from nni manager.
executorManager.releaseExecutor(environment.id);
const remoteEnvironment: RemoteMachineEnvironmentInformation = environment as RemoteMachineEnvironmentInformation;
if (remoteEnvironment.rmMachineMeta === undefined) {
throw new Error(`${remoteEnvironment.id} rmMachineMeta not initialized!`);
}
this.remoteMachineMetaOccupiedMap.set(remoteEnvironment.rmMachineMeta, false);
}

public async startEnvironment(environment: EnvironmentInformation): Promise<void> {
if (this.sshConnectionPromises.length > 0) {
await Promise.all(this.sshConnectionPromises);
this.log.info('ssh connection initialized!');
// set sshConnectionPromises to [] to avoid log information duplicated
this.sshConnectionPromises = [];
if (this.trialConfig === undefined) {
throw new Error("trial config not initialized!");
}
Array.from(this.machineExecutorManagerMap.keys()).forEach(rmMeta => {
// initialize remoteMachineMetaOccupiedMap, false means not occupied
this.remoteMachineMetaOccupiedMap.set(rmMeta, false);
});
}
const remoteEnvironment: RemoteMachineEnvironmentInformation = environment as RemoteMachineEnvironmentInformation;
remoteEnvironment.status = 'WAITING';
// schedule machine for environment, generate command
await this.prepareEnvironment(remoteEnvironment);
// launch runner process in machine
await this.launchRunner(environment);
}

private async prepareEnvironment(environment: RemoteMachineEnvironmentInformation): Promise<boolean> {
if (this.trialConfig === undefined) {
throw new Error('trial config is not initialized');
}

// get an executor from scheduler
const rmMachineMeta: RemoteMachineMeta | undefined = this.scheduleMachine();
if (rmMachineMeta === undefined) {
this.log.warning(`No available machine!`);
return Promise.resolve(false);
} else {
environment.rmMachineMeta = rmMachineMeta;
const executorManager: ExecutorManager | undefined = this.machineExecutorManagerMap.get(environment.rmMachineMeta);
if (executorManager === undefined) {
throw new Error(`executorManager not initialized`);
}
this.environmentExecutorManagerMap.set(environment.id, executorManager);
const executor = await this.getExecutor(environment.id);
environment.runnerWorkingFolder =
executor.joinPath(executor.getRemoteExperimentRootDir(getExperimentId()),
'envs', environment.id)
environment.command = `cd ${environment.runnerWorkingFolder} && \
${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \
&& echo $? \`date +%s%3N\` >${environment.runnerWorkingFolder}/code`;
return Promise.resolve(true);
}
}

private async launchRunner(environment: RemoteMachineEnvironmentInformation): Promise<void> {
if (this.trialConfig === undefined) {
throw new Error('trial config is not initialized');
}
const executor = await this.getExecutor(environment.id);
const environmentLocalTempFolder: string =
path.join(this.experimentRootDir, this.experimentId, "environment-temp")
await executor.createFolder(environment.runnerWorkingFolder);
await execMkdir(environmentLocalTempFolder);
await fs.promises.writeFile(path.join(environmentLocalTempFolder, executor.getScriptName("run")),
environment.command, { encoding: 'utf8' });
// Copy files in codeDir to remote working directory
await executor.copyDirectoryToRemote(environmentLocalTempFolder, environment.runnerWorkingFolder);
// Execute command in remote machine
executor.executeScript(executor.joinPath(environment.runnerWorkingFolder,
executor.getScriptName("run")), true, false);
environment.status = 'RUNNING';
if (environment.rmMachineMeta === undefined) {
throw new Error(`${environment.id} rmMachineMeta not initialized!`);
}
environment.trackingUrl = `file://${environment.rmMachineMeta.ip}:${environment.runnerWorkingFolder}`;
}

private async getExecutor(environmentId: string): Promise<ShellExecutor> {
const executorManager = this.environmentExecutorManagerMap.get(environmentId);
if (executorManager === undefined) {
throw new Error(`ExecutorManager is not assigned for environment ${environmentId}`);
}
return await executorManager.getExecutor(environmentId);
}

public async stopEnvironment(environment: EnvironmentInformation): Promise<void> {
const executor = await this.getExecutor(environment.id);

if (environment.status === 'UNKNOWN') {
environment.status = 'USER_CANCELED';
this.releaseEnvironmentResource(environment);
return
}

const jobpidPath: string = `${environment.runnerWorkingFolder}/pid`;
try {
await executor.killChildProcesses(jobpidPath);
this.releaseEnvironmentResource(environment);
} catch (error) {
this.log.error(`stopEnvironment: ${error}`);
}
}
}
24 changes: 24 additions & 0 deletions src/nni_manager/training_service/reusable/remote/remoteConfig.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

import { EnvironmentInformation } from '../environment';
import { RemoteMachineMeta } from '../../remote_machine/remoteMachineData';

/**
* RemoteMachineEnvironmentInformation
*/
export class RemoteMachineEnvironmentInformation extends EnvironmentInformation {
public rmMachineMeta?: RemoteMachineMeta;
}

export class RemoteConfig {
public readonly reuse: boolean;

/**
* Constructor
* @param reuse If job is reusable for multiple trials
*/
constructor(reuse: boolean) {
this.reuse = reuse;
}
}
Loading

0 comments on commit 392e55f

Please sign in to comment.