diff --git a/Makefile b/Makefile index cbbb4d66d7..2dd1e42458 100644 --- a/Makefile +++ b/Makefile @@ -3,30 +3,45 @@ PIP_INSTALL := python3 -m pip install PIP_UNINSTALL := python3 -m pip uninstall -## Colorful output -_INFO := $(shell echo -e '\e[1;36m') -_WARNING := $(shell echo -e '\e[1;33m') -_END := $(shell echo -e '\e[0m') +# detect OS +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S), Linux) + OS_SPEC := linux + ## Colorful output + _INFO := $(shell echo -e '\e[1;36m') + _WARNING := $(shell echo -e '\e[1;33m') + _END := $(shell echo -e '\e[0m') +else ifeq ($(UNAME_S), Darwin) + OS_SPEC := darwin +else + $(error platform $(UNAME_S) not supported) +endif + + ## Install directories ifeq ($(shell id -u), 0) # is root _ROOT := 1 ROOT_FOLDER ?= $(shell python3 -c 'import site; from pathlib import Path; print(Path(site.getsitepackages()[0]).parents[2])') - BASH_COMP_SCRIPT ?= /usr/share/bash-completion/completions/nnictl + BASH_COMP_PREFIX ?= /usr/share/bash-completion/completions else # is normal user ROOT_FOLDER ?= $(shell python3 -c 'import site; from pathlib import Path; print(Path(site.getusersitepackages()).parents[2])') ifndef VIRTUAL_ENV PIP_MODE ?= --user endif - BASH_COMP_SCRIPT ?= ${HOME}/.bash_completion.d/nnictl + BASH_COMP_PREFIX ?= ${HOME}/.bash_completion.d endif +BASH_COMP_SCRIPT := $(BASH_COMP_PREFIX)/nnictl + +NNI_INSTALL_PATH ?= $(INSTALL_PREFIX)/nni +NNI_TMP_PATH ?= /tmp BIN_FOLDER ?= $(ROOT_FOLDER)/bin NNI_PKG_FOLDER ?= $(ROOT_FOLDER)/nni ## Dependency information -NNI_NODE_TARBALL ?= /tmp/nni-node-linux-x64.tar.xz -NNI_NODE_FOLDER = /tmp/nni-node-linux-x64 +NNI_NODE_TARBALL ?= /tmp/nni-node-$(OS_SPEC)-x64.tar.xz +NNI_NODE_FOLDER = /tmp/nni-node-$(OS_SPEC)-x64 NNI_NODE ?= $(BIN_FOLDER)/node NNI_YARN_TARBALL ?= /tmp/nni-yarn.tar.gz NNI_YARN_FOLDER ?= /tmp/nni-yarn @@ -120,7 +135,7 @@ clean: $(NNI_NODE_TARBALL): #$(_INFO) Downloading Node.js $(_END) - wget https://aka.ms/nodejs-download -O $(NNI_NODE_TARBALL) + wget https://aka.ms/nni/nodejs-download/$(OS_SPEC) -O $(NNI_NODE_TARBALL) $(NNI_YARN_TARBALL): #$(_INFO) Downloading Yarn $(_END) @@ -176,7 +191,8 @@ dev-install-node-modules: .PHONY: install-scripts install-scripts: - install -Dm644 tools/bash-completion $(BASH_COMP_SCRIPT) + mkdir -p $(BASH_COMP_PREFIX) + install -m644 tools/bash-completion $(BASH_COMP_SCRIPT) .PHONY: update-bash-config ifndef _ROOT diff --git a/src/nni_manager/common/utils.ts b/src/nni_manager/common/utils.ts index 4dcd01d512..0fcdccc630 100644 --- a/src/nni_manager/common/utils.ts +++ b/src/nni_manager/common/utils.ts @@ -272,6 +272,14 @@ function getIPV4Address(): string { throw Error('getIPV4Address() failed because no valid IPv4 address found.') } +function getRemoteTmpDir(osType: string): string { + if (osType == 'linux') { + return '/tmp'; + } else { + throw Error(`remote OS ${osType} not supported`); + } +} + /** * Get the status of canceled jobs according to the hint isEarlyStopped */ @@ -279,5 +287,5 @@ function getJobCancelStatus(isEarlyStopped: boolean): TrialJobStatus { return isEarlyStopped ? 'EARLY_STOPPED' : 'USER_CANCELED'; } -export { generateParamFileName, getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getJobCancelStatus, +export {getRemoteTmpDir, generateParamFileName, getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address, mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect }; diff --git a/src/nni_manager/package.json b/src/nni_manager/package.json index 78e06d4ebf..21b3b7fde8 100644 --- a/src/nni_manager/package.json +++ b/src/nni_manager/package.json @@ -3,7 +3,7 @@ "version": "1.0.0", "main": "index.js", "scripts": { - "postbuild": "cp -f --parent scripts/*.py ./dist/", + "postbuild": "cp -rf scripts ./dist/", "build": "tsc", "test": "mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --colors", "start": "node dist/main.js" diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts index e59dc8a9b4..84092c14c5 100644 --- a/src/nni_manager/training_service/local/localTrainingService.ts +++ b/src/nni_manager/training_service/local/localTrainingService.ts @@ -169,7 +169,7 @@ class LocalTrainingService implements TrainingService { this.setTrialJobStatus(trialJob, 'FAILED'); try { const state: string = await fs.promises.readFile(path.join(trialJob.workingDirectory, '.nni', 'state'), 'utf8'); - const match: RegExpMatchArray | null = state.trim().match(/^(\d+)\s+(\d+)$/); + const match: RegExpMatchArray | null = state.trim().match(/^(\d+)\s+(\d+)/); if (match !== null) { const { 1: code, 2: timestamp } = match; if (parseInt(code, 10) === 0) { diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index d709e49dad..6f3b1400a6 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -36,7 +36,7 @@ import { ObservableTimer } from '../../common/observableTimer'; import { HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; -import { delay, generateParamFileName, getExperimentRootDir, uniqueString, getJobCancelStatus } from '../../common/utils'; +import { delay, generateParamFileName, getExperimentRootDir, uniqueString, getJobCancelStatus, getRemoteTmpDir } from '../../common/utils'; import { GPUSummary } from '../common/gpuData'; import { TrialConfig } from '../common/trialConfig'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; @@ -66,8 +66,10 @@ class RemoteMachineTrainingService implements TrainingService { private log: Logger; private isMultiPhase: boolean = false; private trialSequenceId: number; + private readonly remoteOS: string; constructor(@component.Inject timer: ObservableTimer) { + this.remoteOS = 'linux'; this.metricsEmitter = new EventEmitter(); this.trialJobsMap = new Map(); this.machineSSHClientMap = new Map(); @@ -372,7 +374,7 @@ class RemoteMachineTrainingService implements TrainingService { // Copy NNI scripts to remote expeirment working directory const remoteScriptsDir: string = this.getRemoteScriptsPath(); await SSHClientUtility.remoteExeCommand(`mkdir -p ${remoteScriptsDir}`, conn); - await SSHClientUtility.copyDirectoryToRemote('./scripts', remoteScriptsDir, conn); + await SSHClientUtility.copyDirectoryToRemote('./scripts', remoteScriptsDir, conn, this.remoteOS); await SSHClientUtility.remoteExeCommand(`chmod 777 ${nniRootDir} ${nniRootDir}/* ${nniRootDir}/scripts/*`, conn); //Begin to execute gpu_metrics_collection scripts @@ -485,7 +487,7 @@ class RemoteMachineTrainingService implements TrainingService { await this.writeParameterFile(trialJobId, form.hyperParameters, rmScheduleInfo.rmMeta); // Copy files in codeDir to remote working directory - await SSHClientUtility.copyDirectoryToRemote(this.trialConfig.codeDir, trialWorkingFolder, sshClient); + await SSHClientUtility.copyDirectoryToRemote(this.trialConfig.codeDir, trialWorkingFolder, sshClient, this.remoteOS); // Execute command in remote machine SSHClientUtility.remoteExeCommand(`bash ${path.join(trialWorkingFolder, 'run.sh')}`, sshClient); } @@ -576,7 +578,7 @@ class RemoteMachineTrainingService implements TrainingService { } private getRemoteExperimentRootDir(): string{ - return path.join(os.tmpdir(), 'nni', 'experiments', getExperimentId()); + return path.join(getRemoteTmpDir(this.remoteOS), 'nni', 'experiments', getExperimentId()); } private getJobPidPath(jobId: string): string { diff --git a/src/nni_manager/training_service/remote_machine/sshClientUtility.ts b/src/nni_manager/training_service/remote_machine/sshClientUtility.ts index 141f4b7e96..04f6e7fdad 100644 --- a/src/nni_manager/training_service/remote_machine/sshClientUtility.ts +++ b/src/nni_manager/training_service/remote_machine/sshClientUtility.ts @@ -28,7 +28,7 @@ import * as stream from 'stream'; import { Deferred } from 'ts-deferred'; import { NNIError, NNIErrorNames } from '../../common/errors'; import { getLogger } from '../../common/log'; -import { uniqueString } from '../../common/utils'; +import { uniqueString, getRemoteTmpDir } from '../../common/utils'; import { RemoteCommandResult } from './remoteMachineData'; /** @@ -43,11 +43,11 @@ export namespace SSHClientUtility { * @param remoteDirectory remote directory * @param sshClient SSH client */ - export async function copyDirectoryToRemote(localDirectory : string, remoteDirectory : string, sshClient : Client) : Promise { + export async function copyDirectoryToRemote(localDirectory : string, remoteDirectory : string, sshClient : Client, remoteOS: string) : Promise { const deferred: Deferred = new Deferred(); const tmpTarName: string = `${uniqueString(10)}.tar.gz`; const localTarPath: string = path.join(os.tmpdir(), tmpTarName); - const remoteTarPath: string = path.join(os.tmpdir(), tmpTarName); + const remoteTarPath: string = path.join(getRemoteTmpDir(remoteOS), tmpTarName); // Compress files in local directory to experiment root directory await cpp.exec(`tar -czf ${localTarPath} -C ${localDirectory} .`);