Skip to content

Commit

Permalink
Merge pull request #10 from SparkSnail/dev-windows
Browse files Browse the repository at this point in the history
Support gpu in windows
  • Loading branch information
demianzhang authored Apr 3, 2019
2 parents fe86617 + 2b3e99c commit 6fee8fc
Show file tree
Hide file tree
Showing 9 changed files with 189 additions and 19 deletions.
3 changes: 2 additions & 1 deletion src/nni_manager/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@
},
"license": "MIT",
"dependencies": {
"immutable": "^3.7.4",
"azure-storage": "^2.10.2",
"chai-as-promised": "^7.1.1",
"child-process-promise": "^2.2.1",
"express": "^4.16.3",
"express-joi-validator": "^2.0.0",
"express-joi-validator": "^2.0.1",
"js-base64": "^2.4.9",
"kubernetes-client": "^6.5.0",
"rx": "^4.1.0",
Expand Down
9 changes: 8 additions & 1 deletion src/nni_manager/training_service/common/gpuData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,17 @@ export class GPUSummary {
}
}

export const GPU_INFO_COLLECTOR_FORMAT: string =
/**
 * Bash script template that starts the GPU metrics collector.
 *
 * Placeholders (filled in by callers via `String.Format`):
 *  - `{0}`: value exported as the `METRIC_OUTPUT_DIR` environment variable
 *  - `{1}`: file that receives the PID of the shell running the script (`echo $$`)
 *
 * The template text begins with a newline, so `#!/bin/bash` is not the first
 * line of the generated file.
 * NOTE(review): `{0}` and `{1}` are substituted unquoted, so paths containing
 * spaces would presumably break the script; confirm against callers.
 */
export const GPU_INFO_COLLECTOR_FORMAT_LINUX: string =
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`

/**
 * PowerShell script template that starts the GPU metrics collector.
 *
 * Placeholders (filled in by callers via `String.Format`):
 *  - `{0}`: value assigned to the `METRIC_OUTPUT_DIR` environment variable
 *  - `{1}`: file that receives the PowerShell process id (`$PID`), written
 *    without a trailing newline and with `-encoding utf8`
 *
 * The collector itself is started through `cmd /c python -m ...`.
 * NOTE(review): `{1}` is substituted unquoted, so a path containing spaces
 * would presumably break `Out-File`; confirm.
 * NOTE(review): depending on the PowerShell version, `-encoding utf8` may
 * prepend a byte-order mark to the pid file; confirm how readers handle it.
 */
export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
`
$env:METRIC_OUTPUT_DIR="{0}"
Write $PID | Out-File {1} -NoNewline -encoding utf8
cmd /c python -m nni_gpu_tool.gpu_metrics_collector
`
108 changes: 107 additions & 1 deletion src/nni_manager/training_service/common/util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ import { getLogger } from "common/log";
'use strict';

import { countFilesRecursively } from '../../common/utils'
import * as cpp from 'child-process-promise';
import * as cp from 'child_process';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData'
import * as path from 'path';
import { String } from 'typescript-string-operations';
import { file } from "../../node_modules/@types/tmp";

/**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
Expand All @@ -45,4 +51,104 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
}

return fileCount;
}
}

/**
 * Create a new directory by shelling out to the platform's native command:
 * PowerShell `New-Item ... -ItemType "directory" -Force` on Windows,
 * `mkdir -p` everywhere else.
 * @param directory path of the directory to create
 * @returns a promise that settles when the spawned command has finished
 */
export async function execMkdir(directory: string): Promise<void> {
    const isWindows: boolean = process.platform === 'win32';
    const command: string = isWindows
        ? `powershell.exe New-Item -Path ${directory} -ItemType "directory" -Force`
        : `mkdir -p ${directory}`;
    await cpp.exec(command);
}

/**
 * Launch a script file with the platform's interpreter and return immediately;
 * the child process is neither awaited nor is its output captured here.
 * Uses `powershell.exe -file` on Windows and `bash` everywhere else.
 * @param filePath path of the script to run
 */
export function execScript(filePath: string): void {
    const command: string = process.platform === 'win32'
        ? `powershell.exe -file ${filePath}`
        : `bash ${filePath}`;
    cp.exec(command);
}



/**
 * Read the last line of a file using the platform's native command:
 * PowerShell `Get-Content -Tail 1` on Windows, `tail -n 1` everywhere else.
 * @param filePath path of the file to read
 * @returns the result of the spawned command; callers read the line from its `stdout`
 */
export async function execTail(filePath: string): Promise<cpp.childProcessPromise.Result> {
    if (process.platform === 'win32') {
        return await cpp.exec(`powershell.exe Get-Content ${filePath} -Tail 1`);
    }

    return await cpp.exec(`tail -n 1 ${filePath}`);
}

/**
 * Delete a directory together with everything inside it.
 *
 * Runs PowerShell `Remove-Item -Recurse -Force` on Windows and `rm -rf`
 * everywhere else. `-Recurse -Force` is what makes the Windows branch behave
 * like `rm -rf`: a plain `Remove-Item` does not silently delete a directory
 * that still contains files.
 * @param directory path of the directory to delete
 * @returns a promise that settles when the spawned command has finished
 */
export async function execRemove(directory: string): Promise<void> {
    if (process.platform === 'win32') {
        await cpp.exec(`powershell.exe Remove-Item ${directory} -Recurse -Force`);
    } else {
        await cpp.exec(`rm -rf ${directory}`);
    }
}

/**
 * Kill a process using the platform's native command.
 *
 * On Windows this runs PowerShell `kill <pid>`; everywhere else it runs
 * `pkill -P <pid>`, which signals the processes whose parent is `pid`.
 *
 * The pid is normalised before use because callers read it straight from a
 * pid file: `trim()` removes surrounding whitespace, a trailing newline and a
 * leading byte-order mark. Anything that is not purely numeric is rejected
 * instead of being interpolated into a shell command.
 * @param pid process id, as a decimal string
 * @returns a promise that settles when the spawned command has finished
 * @throws Error if `pid` is not a decimal number after trimming
 */
export async function execKill(pid: string): Promise<void> {
    const normalizedPid: string = pid.trim();
    if (!/^\d+$/.test(normalizedPid)) {
        throw new Error(`Invalid pid: ${JSON.stringify(pid)}`);
    }

    if (process.platform === 'win32') {
        await cpp.exec(`powershell.exe kill ${normalizedPid}`);
    } else {
        await cpp.exec(`pkill -P ${normalizedPid}`);
    }
}


/**
 * Build a script file name for the current platform by appending the
 * matching extension: `.ps1` on Windows, `.sh` everywhere else.
 * @param fileNamePrefix file name without extension
 * @returns the file name including the platform-specific extension
 */
export function getScriptName(fileNamePrefix: string): string {
    const extension: string = process.platform === 'win32' ? '.ps1' : '.sh';

    return `${fileNamePrefix}${extension}`;
}

/**
 * Produce the text of the GPU metrics collector script for the current
 * platform. The Windows or Linux template is filled in with the given folder
 * (placeholder `{0}`) and the path of a `pid` file inside that folder
 * (placeholder `{1}`).
 * @param gpuMetricCollectorScriptFolder folder passed to the script and used to locate the `pid` file
 * @returns the formatted script content; nothing is written to disk here
 */
export function getgpuMetricsCollectorScriptContent(gpuMetricCollectorScriptFolder: string): string {
    const template: string = process.platform === 'win32'
        ? GPU_INFO_COLLECTOR_FORMAT_WINDOWS
        : GPU_INFO_COLLECTOR_FORMAT_LINUX;
    const pidFilePath: string = path.join(gpuMetricCollectorScriptFolder, 'pid');

    return String.Format(template, gpuMetricCollectorScriptFolder, pidFilePath);
}
24 changes: 11 additions & 13 deletions src/nni_manager/training_service/local/gpuScheduler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import * as path from 'path';
import * as os from 'os';
import * as fs from 'fs';
import { String } from 'typescript-string-operations';
import { GPU_INFO_COLLECTOR_FORMAT } from '../common/gpuData'
import { execMkdir, getScriptName, getgpuMetricsCollectorScriptContent, execScript, execTail, execRemove, execKill } from '../common/util'

/**
* GPUScheduler
Expand Down Expand Up @@ -63,16 +63,14 @@ class GPUScheduler {
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private async runGpuMetricsCollectorScript(): Promise<void> {
await cpp.exec(`mkdir -p ${this.gpuMetricCollectorScriptFolder}`);
//generate gpu_metrics_collector.sh
let gpuMetricsCollectorScriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics_collector.sh');
const gpuMetricsCollectorScriptContent: string = String.Format(
GPU_INFO_COLLECTOR_FORMAT,
this.gpuMetricCollectorScriptFolder,
path.join(this.gpuMetricCollectorScriptFolder, 'pid'),
);
await execMkdir(this.gpuMetricCollectorScriptFolder);
//generate gpu_metrics_collector script
let gpuMetricsCollectorScriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, getScriptName('gpu_metrics_collector'));
const gpuMetricsCollectorScriptContent: string = getgpuMetricsCollectorScriptContent(this.gpuMetricCollectorScriptFolder);
console.log(this.gpuMetricCollectorScriptFolder)
console.log(gpuMetricsCollectorScriptContent)
await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
cp.exec(`bash ${gpuMetricsCollectorScriptPath}`);
execScript(gpuMetricsCollectorScriptPath)
}

public getAvailableGPUIndices(): number[] {
Expand All @@ -87,15 +85,15 @@ class GPUScheduler {
this.stopping = true;
try {
const pid: string = await fs.promises.readFile(path.join(this.gpuMetricCollectorScriptFolder, 'pid'), 'utf8');
await cpp.exec(`pkill -P ${pid}`);
await cpp.exec(`rm -rf ${this.gpuMetricCollectorScriptFolder}`);
await execKill(pid);
await execRemove(this.gpuMetricCollectorScriptFolder);
} catch (error){
this.log.error(`GPU scheduler error: ${error}`);
}
}

private async updateGPUSummary() {
const cmdresult = await cpp.exec(`tail -n 1 ${path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics')}`);
const cmdresult = await execTail(path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics'));
if(cmdresult && cmdresult.stdout) {
this.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ import {
RemoteMachineScheduleInfo, RemoteMachineScheduleResult, SSHClient, SSHClientManager,
RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT
} from './remoteMachineData';
import { GPU_INFO_COLLECTOR_FORMAT } from '../common/gpuData';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX } from '../common/gpuData';
import { SSHClientUtility } from './sshClientUtility';
import { validateCodeDir } from '../common/util';
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
Expand Down Expand Up @@ -452,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh');
const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script
const gpuMetricsCollectorScriptContent: string = String.Format(
GPU_INFO_COLLECTOR_FORMAT,
GPU_INFO_COLLECTOR_FORMAT_LINUX,
remoteGPUScriptsDir,
path.join(remoteGPUScriptsDir, 'pid'),
);
Expand Down
16 changes: 16 additions & 0 deletions test/generate_ts_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import sys
import glob
import argparse
from utils import get_yml_content, dump_yml_content

Expand Down Expand Up @@ -69,6 +71,19 @@ def update_training_service_config(args):

dump_yml_content(TRAINING_SERVICE_FILE, config)

def convert_command():
'''Replace ``python3`` with ``python`` in the trial command of test config files.

Only acts when ``sys.platform`` is ``'win32'``; on any other platform it
returns None without touching any file.
'''
if sys.platform != 'win32':
return None
# glob is called without recursive=True, so '**' matches like '*': the two
# patterns cover .yml files one and two directory levels below the current directory.
config_files = glob.glob('./**/*.yml') + glob.glob('./**/**/*.yml')
for config_file in config_files:
print('processing {}'.format(config_file))
yml_content = get_yml_content(config_file)
# Only configs that have a non-empty trial.command are rewritten.
if yml_content.get('trial'):
if yml_content['trial'].get('command'):
yml_content['trial']['command'] = yml_content['trial']['command'].replace('python3', 'python')
# NOTE(review): indentation is lost in this view; presumably the file is written
# back only when a command was rewritten -- confirm against the real file.
dump_yml_content(config_file, yml_content)

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote'], default='pai')
Expand Down Expand Up @@ -96,3 +111,4 @@ def update_training_service_config(args):
args = parser.parse_args()

update_training_service_config(args)
convert_command()
36 changes: 36 additions & 0 deletions test/pipelines-it-local-windows.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Integration-test pipeline for a local Windows machine: install the toolkit
# from source, install test dependencies, generate configs, then run the tests.
jobs:
- job: 'Test'

steps:
# Build and install via the PowerShell installer script.
- script: |
powershell.exe -file install.ps1
displayName: 'Install nni toolkit via source code'
# Pinned Python packages needed by the integration tests.
- script: |
python -m pip install scikit-learn==0.20.0 --user
python -m pip install keras==2.1.6 --user
python -m pip install tensorflow-gpu==1.10.0 --user
displayName: 'Install dependencies for integration tests'
# generate_ts_config.py is run with no arguments here.
- script: |
cd test
python generate_ts_config.py
displayName: 'generate config files'
- script: |
cd test
powershell.exe -file unittest.ps1
displayName: 'unit test'
- script: |
cd test
python naive_test.py
displayName: 'Naive test'
- script: |
cd test
python tuner_test.py
displayName: 'Built-in tuners / assessors tests'
- script: |
cd test
python metrics_test.py
displayName: 'Trial job metrics test'
# cifar10, smac and batchtuner are excluded from the local run.
- script: |
cd test
python config_test.py --ts local --exclude cifar10,smac,batchtuner
displayName: 'Examples and advanced features tests on local machine'
5 changes: 4 additions & 1 deletion test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,10 @@ def print_stderr(trial_jobs_url):
for trial_job in trial_jobs:
if trial_job['status'] == 'FAILED':
stderr_path = trial_job['stderrPath'].split(':')[-1]
subprocess.run(['cat', stderr_path])
if sys.platform == "win32":
subprocess.run(['type', stderr_path], shell=True)
else:
subprocess.run(['cat', stderr_path])

def parse_max_duration_time(max_exec_duration):
unit = max_exec_duration[-1]
Expand Down
3 changes: 3 additions & 0 deletions tools/nni_gpu_tool/gpu_metrics_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
from xml.dom import minidom

def check_ready_to_run():
#TODO check process in windows
if sys.platform == 'win32':
return True
pgrep_output =subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pidList = []
for pid in pgrep_output.splitlines():
Expand Down

0 comments on commit 6fee8fc

Please sign in to comment.