Skip to content
This repository was archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
Merge remote-tracking branch 'upstream/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
scarlett2018 committed Nov 13, 2018
2 parents 800751c + 183763e commit abeea90
Show file tree
Hide file tree
Showing 39 changed files with 165 additions and 44 deletions.
14 changes: 12 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ BIN_FOLDER ?= $(ROOT_FOLDER)/bin
NNI_PKG_FOLDER ?= $(ROOT_FOLDER)/nni

## Dependency information
$(info $(_INFO) Installing dependencies, use local toolchain $(_END))
NNI_NODE_TARBALL ?= /tmp/nni-node-linux-x64.tar.xz
NNI_NODE_FOLDER = /tmp/nni-node-linux-x64
NNI_NODE ?= $(BIN_FOLDER)/node
Expand Down Expand Up @@ -104,6 +103,17 @@ uninstall:
-rm -f $(BIN_FOLDER)/nnictl
-rm -f $(BASH_COMP_SCRIPT)

.PHONY: clean
clean:
-rm -rf tools/build
-rm -rf tools/nnictl.egg-info
-rm -rf src/nni_manager/dist
-rm -rf src/nni_manager/node_modules
-rm -rf src/sdk/pynni/build
-rm -rf src/sdk/pynni/nni_sdk.egg-info
-rm -rf src/webui/build
-rm -rf src/webui/node_modules

# Main targets end

# Helper targets
Expand All @@ -116,7 +126,7 @@ $(NNI_YARN_TARBALL):
#$(_INFO) Downloading Yarn $(_END)
wget https://aka.ms/yarn-download -O $(NNI_YARN_TARBALL)

.PHONY: intall-dependencies
.PHONY: install-dependencies
install-dependencies: $(NNI_NODE_TARBALL) $(NNI_YARN_TARBALL)
#$(_INFO) Extracting Node.js $(_END)
rm -rf $(NNI_NODE_FOLDER)
Expand Down
14 changes: 11 additions & 3 deletions deployment/docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,17 @@ RUN python3 -m pip --no-cache-dir install tensorflow-gpu==1.10.0
#
RUN python3 -m pip --no-cache-dir install Keras==2.1.6

#sklearn
RUN python3 -m pip --no-cache-dir install scikit-learn
#
#PyTorch
#
RUN python3 -m pip --no-cache-dir install torch==0.4.1
RUN python3 -m pip install torchvision==0.2.1

#
#sklearn 0.20.0
#
RUN python3 -m pip --no-cache-dir install scikit-learn==0.20.0

ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/bin:/sbin

WORKDIR /root
WORKDIR /root
19 changes: 13 additions & 6 deletions deployment/docker/README.md
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
Dockerfile
===
## 1.Description
This is the Dockerfile of nni project, including the most kinds of deeplearning frameworks and nni source code. You can run your nni experiment in this docker container directly.
Dockerfile could build the customized docker image, users could build their customized docker image using this file.
This docker file includes the following libraries on `Ubuntu 16.04 LTS`:
This is the Dockerfile of nni project. It includes several popular deep learning frameworks and NNI. It is tested on `Ubuntu 16.04 LTS`:

```
CUDA 9.0, CuDNN 7.0
numpy 1.14.3,scipy 1.1.0
TensorFlow 1.5.0
Keras 2.1.6
PyTorch 0.4.1
scikit-learn 0.20.0
NNI v0.3
```
You can take this Dockerfile as a reference for your own customized Dockerfile.

## 2.How to build and run
__Use the following command to build docker image__
```
__Use the following command from `nni/deployment/docker` to build docker image__
```
docker build -t nni/nni .
```
__Run the docker image__
Expand All @@ -30,4 +31,10 @@ __Run the docker image__
or
```
docker run --runtime=nvidia -it nni/nni
```
```

## 3.Directly retrieve the docker image
Use the following command to retrieve the NNI docker image from Docker Hub
```
docker pull msranni/nni:latest
```
2 changes: 1 addition & 1 deletion docs/InstallNNI_Ubuntu.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

* __Install NNI in docker image__

You can also install NNI in a docker image. Please follow the instructions [here](../deployment/docker/README.md) to build NNI docker image.
You can also install NNI in a docker image. Please follow the instructions [here](../deployment/docker/README.md) to build NNI docker image. The NNI docker image can also be retrieved from Docker Hub through the command `docker pull msranni/nni:latest`.

## Further reading
* [Overview](Overview.md)
Expand Down
29 changes: 27 additions & 2 deletions docs/tutorial_1_CR_exp_local_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ authorName: your_name
experimentName: auto_mnist
# how many trials could be concurrently running
trialConcurrency: 2
trialConcurrency: 1
# maximum experiment running duration
maxExecDuration: 3h
Expand Down Expand Up @@ -133,4 +133,29 @@ With all these steps done, we can run the experiment with the following command:
You can refer to [here](NNICTLDOC.md) for more usage guide of *nnictl* command line tool.

## View experiment results
The experiment has been running now, NNI provides WebUI for you to view experiment progress, to control your experiment, and some other appealing features. The WebUI is opened by default by `nnictl create`.
The experiment has been running now. Other than *nnictl*, NNI also provides a WebUI for you to view experiment progress, control your experiment, and use some other appealing features.

## Using multiple local GPUs to speed up search
The following steps assume that you have 4 NVIDIA GPUs installed locally and [TensorFlow with GPU support](https://www.tensorflow.org/install/gpu). The demo enables 4 concurrent trial jobs and each trial job uses 1 GPU.

**Prepare configuration file**: NNI provides a demo configuration file for the setting above, `cat ~/nni/examples/trials/mnist-annotation/config_gpu.yml` to see it. The trialConcurrency and gpuNum are different from the basic configuration file:

```
...
# how many trials could be concurrently running
trialConcurrency: 4
...
trial:
command: python mnist.py
codeDir: ~/nni/examples/trials/mnist-annotation
gpuNum: 1
```

We can run the experiment with the following command:

nnictl create --config ~/nni/examples/trials/mnist-annotation/config_gpu.yml

You can use the *nnictl* command line tool or WebUI to trace the training progress. The *nvidia-smi* command line tool can also help you monitor GPU usage during training.
20 changes: 20 additions & 0 deletions examples/trials/mnist-annotation/config_gpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
authorName: default
experimentName: example_mnist
trialConcurrency: 4
maxExecDuration: 1h
maxTrialNum: 10
#choice: local, remote, pai
trainingServicePlatform: local
#choice: true, false
useAnnotation: true
tuner:
#choice: TPE, Random, Anneal, Evolution, BatchTuner
#SMAC (SMAC should be installed through nnictl)
builtinTunerName: TPE
classArgs:
#choice: maximize, minimize
optimize_mode: maximize
trial:
command: python3 mnist.py
codeDir: .
gpuNum: 1
20 changes: 16 additions & 4 deletions src/nni_manager/common/experimentStartupInfo.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,17 @@ import * as component from '../common/component';
class ExperimentStartupInfo {
private experimentId: string = '';
private newExperiment: boolean = true;
private basePort: number = -1;
private initialized: boolean = false;
private initTrialSequenceID: number = 0;

public setStartupInfo(newExperiment: boolean, experimentId: string): void {
public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number): void {
assert(!this.initialized);
assert(experimentId.trim().length > 0);

this.newExperiment = newExperiment;
this.experimentId = experimentId;
this.basePort = basePort;
this.initialized = true;
}

Expand All @@ -44,6 +46,12 @@ class ExperimentStartupInfo {
return this.experimentId;
}

public getBasePort(): number {
assert(this.initialized);

return this.basePort;
}

public isNewExperiment(): boolean {
assert(this.initialized);

Expand All @@ -66,6 +74,10 @@ function getExperimentId(): string {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).getExperimentId();
}

function getBasePort(): number {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).getBasePort();
}

function isNewExperiment(): boolean {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).isNewExperiment();
}
Expand All @@ -78,9 +90,9 @@ function getInitTrialSequenceId(): number {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).getInitTrialSequenceId();
}

function setExperimentStartupInfo(newExperiment: boolean, experimentId: string): void {
component.get<ExperimentStartupInfo>(ExperimentStartupInfo).setStartupInfo(newExperiment, experimentId);
function setExperimentStartupInfo(newExperiment: boolean, experimentId: string, basePort: number): void {
component.get<ExperimentStartupInfo>(ExperimentStartupInfo).setStartupInfo(newExperiment, experimentId, basePort);
}

export { ExperimentStartupInfo, getExperimentId, isNewExperiment,
export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment,
setExperimentStartupInfo, setInitTrialSequenceId, getInitTrialSequenceId };
16 changes: 12 additions & 4 deletions src/nni_manager/common/restServer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@

'use strict';

import * as assert from 'assert';
import * as express from 'express';
import * as http from 'http';
import { Deferred } from 'ts-deferred';
import { getLogger, Logger } from './log';
import { getBasePort } from './experimentStartupInfo';

/**
* Abstraction class to create a RestServer
Expand All @@ -39,13 +41,20 @@ export abstract class RestServer {
protected port?: number;
protected app: express.Application = express();
protected log: Logger = getLogger();
protected basePort?: number;

constructor() {
this.port = getBasePort();
assert(this.port && this.port > 1024);
}

get endPoint(): string {
// tslint:disable-next-line:no-http-string
return `http://${this.hostName}:${this.port}`;
}

public start(port?: number, hostName?: string): Promise<void> {
public start(hostName?: string): Promise<void> {
this.log.info(`RestServer start`);
if (this.startTask !== undefined) {
return this.startTask.promise;
}
Expand All @@ -56,9 +65,8 @@ export abstract class RestServer {
if (hostName) {
this.hostName = hostName;
}
if (port) {
this.port = port;
}

this.log.info(`RestServer base port is ${this.port}`);

this.server = this.app.listen(this.port as number, this.hostName).on('listening', () => {
this.startTask.resolve();
Expand Down
2 changes: 1 addition & 1 deletion src/nni_manager/common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ function prepareUnitTest(): void {
Container.snapshot(TrainingService);
Container.snapshot(Manager);

setExperimentStartupInfo(true, 'unittest');
setExperimentStartupInfo(true, 'unittest', 8080);
mkDirPSync(getLogDir());

const sqliteFile: string = path.join(getDefaultDatabaseDir(), 'nni.sqlite');
Expand Down
5 changes: 5 additions & 0 deletions src/nni_manager/core/nnimanager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,9 @@ class NNIManager implements Manager {

private async requestTrialJobsStatus(): Promise<number> {
let finishedTrialJobNum: number = 0;
if (this.dispatcher === undefined) {
throw new Error('Error: tuner has not been setup');
}
for (const trialJobId of Array.from(this.trialJobs.keys())) {
const trialJobDetail: TrialJobDetail = await this.trainingService.getTrialJob(trialJobId);
const oldTrialJobDetail: TrialJobDetail | undefined = this.trialJobs.get(trialJobId);
Expand All @@ -367,13 +370,15 @@ class NNIManager implements Manager {
case 'USER_CANCELED':
this.trialJobs.delete(trialJobId);
finishedTrialJobNum++;
this.dispatcher.sendCommand(TRIAL_END, JSON.stringify({trial_job_id: trialJobDetail.id, event: trialJobDetail.status}));
break;
case 'FAILED':
case 'SYS_CANCELED':
// In the current version, we do not retry
// TO DO: push this job to queue for retry
this.trialJobs.delete(trialJobId);
finishedTrialJobNum++;
this.dispatcher.sendCommand(TRIAL_END, JSON.stringify({trial_job_id: trialJobDetail.id, event: trialJobDetail.status}));
break;
case 'WAITING':
case 'RUNNING':
Expand Down
8 changes: 4 additions & 4 deletions src/nni_manager/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ import {
import { PAITrainingService } from './training_service/pai/paiTrainingService'


function initStartupInfo(startExpMode: string, resumeExperimentId: string) {
function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number) {
const createNew: boolean = (startExpMode === 'new');
const expId: string = createNew ? uniqueString(8) : resumeExperimentId;
setExperimentStartupInfo(createNew, expId);
setExperimentStartupInfo(createNew, expId, basePort);
}

async function initContainer(platformMode: string): Promise<void> {
Expand Down Expand Up @@ -93,14 +93,14 @@ if (startMode === 'resume' && experimentId.trim().length < 1) {
process.exit(1);
}

initStartupInfo(startMode, experimentId);
initStartupInfo(startMode, experimentId, port);

mkDirP(getLogDir()).then(async () => {
const log: Logger = getLogger();
try {
await initContainer(mode);
const restServer: NNIRestServer = component.get(NNIRestServer);
await restServer.start(port);
await restServer.start();
log.info(`Rest server listening on: ${restServer.endPoint}`);
} catch (err) {
log.error(`${err.stack}`);
Expand Down
4 changes: 2 additions & 2 deletions src/nni_manager/training_service/pai/paiData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ fi`;
export const PAI_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3}
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{4}' --nnimanager_ip '{5}' --pai_hdfs_output_dir '{6}'
--pai_hdfs_host '{7}' --pai_user_name {8}`;
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{4}' --nnimanager_ip '{5}' --nnimanager_port '{6}'
--pai_hdfs_output_dir '{7}' --pai_hdfs_host '{8}' --pai_user_name {9}`;

export const PAI_OUTPUT_DIR_FORMAT: string =
`hdfs://{0}:9000/`;
Expand Down
14 changes: 13 additions & 1 deletion src/nni_manager/training_service/pai/paiJobRestServer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@

'use strict';

import * as assert from 'assert';
import { Request, Response, Router } from 'express';
import * as bodyParser from 'body-parser';
import * as component from '../../common/component';
import { getBasePort } from '../../common/experimentStartupInfo';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { Inject } from 'typescript-ioc';
import { PAITrainingService } from './paiTrainingService';
Expand All @@ -48,10 +50,20 @@ export class PAIJobRestServer extends RestServer{
*/
constructor() {
super();
this.port = PAIJobRestServer.DEFAULT_PORT;
const basePort: number = getBasePort();
assert(basePort && basePort > 1024);

this.port = basePort + 1; // PAIJobRestServer.DEFAULT_PORT;
this.paiTrainingService = component.get(PAITrainingService);
}

public get paiRestServerPort(): number {
if(!this.port) {
throw new Error('PAI Rest server port is undefined');
}
return this.port;
}

/**
* NNIRestServer's own router registration
*/
Expand Down
Loading

0 comments on commit abeea90

Please sign in to comment.