Skip to content

Commit

Permalink
mac support with local, remote & pai mode (microsoft#386)
Browse files Browse the repository at this point in the history
* update Makefile for mac support, wait for aka.ms support

* refix Makefile for colorful echo

* update Makefile with shorturl

* fix false fail on mac webui

* fix cross os remote tmpdir issue

* add readonly to RemoteMachineTrainingService.remoteOS

* fix var name for PR 386
  • Loading branch information
leckie-chn authored and yds05 committed Nov 27, 2018
1 parent 694bb53 commit 101b02f
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 20 deletions.
36 changes: 26 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,45 @@
PIP_INSTALL := python3 -m pip install
PIP_UNINSTALL := python3 -m pip uninstall

## Colorful output
_INFO := $(shell echo -e '\e[1;36m')
_WARNING := $(shell echo -e '\e[1;33m')
_END := $(shell echo -e '\e[0m')
# detect OS
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S), Linux)
OS_SPEC := linux
## Colorful output
_INFO := $(shell echo -e '\e[1;36m')
_WARNING := $(shell echo -e '\e[1;33m')
_END := $(shell echo -e '\e[0m')
else ifeq ($(UNAME_S), Darwin)
OS_SPEC := darwin
else
$(error platform $(UNAME_S) not supported)
endif



## Install directories
ifeq ($(shell id -u), 0) # is root
_ROOT := 1
ROOT_FOLDER ?= $(shell python3 -c 'import site; from pathlib import Path; print(Path(site.getsitepackages()[0]).parents[2])')
BASH_COMP_SCRIPT ?= /usr/share/bash-completion/completions/nnictl
BASH_COMP_PREFIX ?= /usr/share/bash-completion/completions
else # is normal user
ROOT_FOLDER ?= $(shell python3 -c 'import site; from pathlib import Path; print(Path(site.getusersitepackages()).parents[2])')
ifndef VIRTUAL_ENV
PIP_MODE ?= --user
endif
BASH_COMP_SCRIPT ?= ${HOME}/.bash_completion.d/nnictl
BASH_COMP_PREFIX ?= ${HOME}/.bash_completion.d
endif
BASH_COMP_SCRIPT := $(BASH_COMP_PREFIX)/nnictl

NNI_INSTALL_PATH ?= $(INSTALL_PREFIX)/nni
NNI_TMP_PATH ?= /tmp

BIN_FOLDER ?= $(ROOT_FOLDER)/bin
NNI_PKG_FOLDER ?= $(ROOT_FOLDER)/nni

## Dependency information
NNI_NODE_TARBALL ?= /tmp/nni-node-linux-x64.tar.xz
NNI_NODE_FOLDER = /tmp/nni-node-linux-x64
NNI_NODE_TARBALL ?= /tmp/nni-node-$(OS_SPEC)-x64.tar.xz
NNI_NODE_FOLDER = /tmp/nni-node-$(OS_SPEC)-x64
NNI_NODE ?= $(BIN_FOLDER)/node
NNI_YARN_TARBALL ?= /tmp/nni-yarn.tar.gz
NNI_YARN_FOLDER ?= /tmp/nni-yarn
Expand Down Expand Up @@ -120,7 +135,7 @@ clean:

$(NNI_NODE_TARBALL):
#$(_INFO) Downloading Node.js $(_END)
wget https://aka.ms/nodejs-download -O $(NNI_NODE_TARBALL)
wget https://aka.ms/nni/nodejs-download/$(OS_SPEC) -O $(NNI_NODE_TARBALL)

$(NNI_YARN_TARBALL):
#$(_INFO) Downloading Yarn $(_END)
Expand Down Expand Up @@ -176,7 +191,8 @@ dev-install-node-modules:

.PHONY: install-scripts
install-scripts:
install -Dm644 tools/bash-completion $(BASH_COMP_SCRIPT)
mkdir -p $(BASH_COMP_PREFIX)
install -m644 tools/bash-completion $(BASH_COMP_SCRIPT)

.PHONY: update-bash-config
ifndef _ROOT
Expand Down
10 changes: 9 additions & 1 deletion src/nni_manager/common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -272,12 +272,20 @@ function getIPV4Address(): string {
throw Error('getIPV4Address() failed because no valid IPv4 address found.')
}

/**
 * Get the temporary directory path to use on a remote machine.
 *
 * The path is selected from the given remote OS type, not from the
 * environment this code is running in.
 *
 * @param osType OS type of the remote machine; only 'linux' is handled
 * @returns '/tmp' when osType is 'linux'
 * @throws Error when osType is any value other than 'linux'
 */
function getRemoteTmpDir(osType: string): string {
    // Strict equality: compare the OS name without implicit type coercion.
    if (osType === 'linux') {
        return '/tmp';
    }

    throw Error(`remote OS ${osType} not supported`);
}

/**
 * Map the isEarlyStopped hint to the status reported for a canceled job.
 *
 * @param isEarlyStopped hint telling whether the job was early stopped
 * @returns 'EARLY_STOPPED' when isEarlyStopped is true, otherwise 'USER_CANCELED'
 */
function getJobCancelStatus(isEarlyStopped: boolean): TrialJobStatus {
    if (isEarlyStopped) {
        return 'EARLY_STOPPED';
    }

    return 'USER_CANCELED';
}

export { generateParamFileName, getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getJobCancelStatus,
export {getRemoteTmpDir, generateParamFileName, getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getJobCancelStatus,
getDefaultDatabaseDir, getIPV4Address, mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect };
2 changes: 1 addition & 1 deletion src/nni_manager/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"version": "1.0.0",
"main": "index.js",
"scripts": {
"postbuild": "cp -f --parent scripts/*.py ./dist/",
"postbuild": "cp -rf scripts ./dist/",
"build": "tsc",
"test": "mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --colors",
"start": "node dist/main.js"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ class LocalTrainingService implements TrainingService {
this.setTrialJobStatus(trialJob, 'FAILED');
try {
const state: string = await fs.promises.readFile(path.join(trialJob.workingDirectory, '.nni', 'state'), 'utf8');
const match: RegExpMatchArray | null = state.trim().match(/^(\d+)\s+(\d+)$/);
const match: RegExpMatchArray | null = state.trim().match(/^(\d+)\s+(\d+)/);
if (match !== null) {
const { 1: code, 2: timestamp } = match;
if (parseInt(code, 10) === 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ import { ObservableTimer } from '../../common/observableTimer';
import {
HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric
} from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString, getJobCancelStatus } from '../../common/utils';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString, getJobCancelStatus, getRemoteTmpDir } from '../../common/utils';
import { GPUSummary } from '../common/gpuData';
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
Expand Down Expand Up @@ -66,8 +66,10 @@ class RemoteMachineTrainingService implements TrainingService {
private log: Logger;
private isMultiPhase: boolean = false;
private trialSequenceId: number;
private readonly remoteOS: string;

constructor(@component.Inject timer: ObservableTimer) {
this.remoteOS = 'linux';
this.metricsEmitter = new EventEmitter();
this.trialJobsMap = new Map<string, RemoteMachineTrialJobDetail>();
this.machineSSHClientMap = new Map<RemoteMachineMeta, Client>();
Expand Down Expand Up @@ -372,7 +374,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Copy NNI scripts to remote experiment working directory
const remoteScriptsDir: string = this.getRemoteScriptsPath();
await SSHClientUtility.remoteExeCommand(`mkdir -p ${remoteScriptsDir}`, conn);
await SSHClientUtility.copyDirectoryToRemote('./scripts', remoteScriptsDir, conn);
await SSHClientUtility.copyDirectoryToRemote('./scripts', remoteScriptsDir, conn, this.remoteOS);
await SSHClientUtility.remoteExeCommand(`chmod 777 ${nniRootDir} ${nniRootDir}/* ${nniRootDir}/scripts/*`, conn);

//Begin to execute gpu_metrics_collection scripts
Expand Down Expand Up @@ -485,7 +487,7 @@ class RemoteMachineTrainingService implements TrainingService {
await this.writeParameterFile(trialJobId, form.hyperParameters, rmScheduleInfo.rmMeta);

// Copy files in codeDir to remote working directory
await SSHClientUtility.copyDirectoryToRemote(this.trialConfig.codeDir, trialWorkingFolder, sshClient);
await SSHClientUtility.copyDirectoryToRemote(this.trialConfig.codeDir, trialWorkingFolder, sshClient, this.remoteOS);
// Execute command in remote machine
SSHClientUtility.remoteExeCommand(`bash ${path.join(trialWorkingFolder, 'run.sh')}`, sshClient);
}
Expand Down Expand Up @@ -576,7 +578,7 @@ class RemoteMachineTrainingService implements TrainingService {
}

private getRemoteExperimentRootDir(): string{
return path.join(os.tmpdir(), 'nni', 'experiments', getExperimentId());
return path.join(getRemoteTmpDir(this.remoteOS), 'nni', 'experiments', getExperimentId());
}

private getJobPidPath(jobId: string): string {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import * as stream from 'stream';
import { Deferred } from 'ts-deferred';
import { NNIError, NNIErrorNames } from '../../common/errors';
import { getLogger } from '../../common/log';
import { uniqueString } from '../../common/utils';
import { uniqueString, getRemoteTmpDir } from '../../common/utils';
import { RemoteCommandResult } from './remoteMachineData';

/**
Expand All @@ -43,11 +43,11 @@ export namespace SSHClientUtility {
* @param remoteDirectory remote directory
* @param sshClient SSH client
*/
export async function copyDirectoryToRemote(localDirectory : string, remoteDirectory : string, sshClient : Client) : Promise<void> {
export async function copyDirectoryToRemote(localDirectory : string, remoteDirectory : string, sshClient : Client, remoteOS: string) : Promise<void> {
const deferred: Deferred<void> = new Deferred<void>();
const tmpTarName: string = `${uniqueString(10)}.tar.gz`;
const localTarPath: string = path.join(os.tmpdir(), tmpTarName);
const remoteTarPath: string = path.join(os.tmpdir(), tmpTarName);
const remoteTarPath: string = path.join(getRemoteTmpDir(remoteOS), tmpTarName);

// Compress files in local directory to experiment root directory
await cpp.exec(`tar -czf ${localTarPath} -C ${localDirectory} .`);
Expand Down

0 comments on commit 101b02f

Please sign in to comment.