Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Training container for NAS Envelopenet #429

Merged
merged 3 commits into from
May 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions examples/NAS-Envelopenet-trainingcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Training image for the NAS EnvelopeNet example.
# The CUDA / cuDNN versions are build arguments so the base image can be
# switched with `--build-arg` without editing this file.
ARG cuda_version=9.0
ARG cudnn_version=7
FROM nvidia/cuda:${cuda_version}-cudnn${cudnn_version}-devel

# Install system packages. The apt lists are removed in the same layer so
# they do not end up in the image.
RUN apt-get update && apt-get install -y software-properties-common && \
    add-apt-repository ppa:deadsnakes/ppa && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
    bzip2 \
    g++ \
    git \
    graphviz \
    libgl1-mesa-glx \
    libhdf5-dev \
    openmpi-bin \
    python3.5 \
    python3-pip \
    python3-setuptools \
    python3-dev \
    wget && \
    rm -rf /var/lib/apt/lists/*

# COPY instead of ADD: this is a plain copy of the build context, so ADD's
# extra behaviours (archive auto-extraction, remote URLs) are not wanted.
COPY . /app
WORKDIR /app

# pip is upgraded in its own RUN step so that the following step starts a
# fresh shell and resolves `pip3` again after the upgrade.
RUN pip3 install --no-cache-dir --upgrade pip
RUN pip3 install --no-cache-dir -r requirements.txt
ENV PYTHONPATH=/app

# -u: unbuffered stdout/stderr, so output shows up in the container log
# as soon as it is written.
ENTRYPOINT ["python3.5", "-u", "run_trial.py"]
18 changes: 18 additions & 0 deletions examples/NAS-Envelopenet-trainingcontainer/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# About the training container

The algorithm follows the idea proposed in *Fast Neural Architecture Construction using EnvelopeNets* by Kamath et al. (https://arxiv.org/pdf/1803.06744.pdf). It is not a reinforcement-learning or evolution-based NAS method,
but rather a method for constructing deep networks.

# How this code works

First, the YAML file is parsed using Operation.py and suggestion_param.py. Then the suggestion in nasenvelopenet_service.py calls nac_gen.py to generate the initial architecture and passes it to run_trial.py.
run_trial.py is the entrypoint of this container and is called from the suggestion. It invokes the Model Constructor, which constructs the model. The algorithm has a parameter, max_iterations, which is the maximum number of restructuring
iterations of the model. When this is reached, it evaluates the model.
Based on this, the suggestion calls generate_arch.py to improve the architecture using the collected metrics, and this loop runs until max_iterations is reached.

The Model Constructor uses various methods from net.py to build the model. net.py in turn uses cell_init.py, cell_main.py and cell_classification.py as the definitions of the initial cell, the envelope cell and the classification cell used
to build the model. cifar10_input.py provides various methods needed for the CIFAR-10 dataset. Evaluate.py has various methods for testing.

# How to run this code

I have attached a test script, test.py, which I used to parse the YAML file and run this code locally. However, the code has changed since I tested it on Katib, so you might need to adjust it.
10 changes: 10 additions & 0 deletions examples/NAS-Envelopenet-trainingcontainer/cell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""Base cell."""

import tensorflow as tf
class Cell:
    """Common parent class of every cell type."""

    def __init__(self):
        """Nothing to initialise here; subclasses set their own attributes."""

    def get_params(self):
        """Print the names of all nodes in the default TensorFlow graph."""
        graph_def = tf.get_default_graph().as_graph_def()
        node_names = []
        for node in graph_def.node:
            node_names.append(node.name)
        print(node_names)
52 changes: 52 additions & 0 deletions examples/NAS-Envelopenet-trainingcontainer/cell_classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""Classification cell"""

import tensorflow as tf
from cell import Cell

slim = tf.contrib.slim

def trunc_normal(stddev):
    """Return a zero-mean truncated-normal initializer with the given stddev."""
    zero_mean = 0.0
    return tf.truncated_normal_initializer(mean=zero_mean, stddev=stddev)

class Classification(Cell):
    """Classification cell: the final classification block of a CNN."""

    def __init__(self):
        self.cellname = "Classification"
        Cell.__init__(self)

    def cell(self, inputs, arch, is_training):
        """Create the cell by instantiating the cell blocks.

        Args:
            inputs: tensor fed into the first block of the cell.
            arch: nested mapping ``arch[layer][branch] -> block``. Layers and
                branches are visited in sorted key order and each block is
                applied to the running output in turn. A block is a dict
                whose ``"block"`` entry is one of ``"reduce_mean"``,
                ``"flatten"``, ``"fc"``, ``"fc-final"`` or ``"dropout"``;
                ``"fc"`` also reads ``"outputs"``, ``"fc-final"`` reads
                ``"outputs"`` and ``"inputs"``, ``"dropout"`` reads
                ``"keep_prob"``.
            is_training: forwarded to ``slim.dropout``.

        Returns:
            The output of the last block (``inputs`` when ``arch`` is empty).

        Raises:
            SystemExit: with status -1, after printing "Invalid block", when
                a block type is not recognised.
        """
        nscope = 'Cell_{}'.format(self.cellname)
        net = inputs
        with tf.variable_scope(nscope, 'classification_block', [inputs], reuse=None):
            for layer in sorted(arch):
                branches = arch[layer]
                for branch in sorted(branches):
                    block = branches[branch]
                    kind = block["block"]
                    if kind == "reduce_mean":
                        # Average over axes 1 and 2 of the running output.
                        net = tf.reduce_mean(net, [1, 2])
                    elif kind == "flatten":
                        net = slim.flatten(net)
                    elif kind == "fc":
                        net = slim.fully_connected(net, block["outputs"])
                    elif kind == "fc-final":
                        outputs = block["outputs"]
                        # Named fan_in so the `inputs` argument is not shadowed.
                        fan_in = block["inputs"]
                        net = slim.fully_connected(
                            net,
                            outputs,
                            biases_initializer=tf.zeros_initializer(),
                            weights_initializer=trunc_normal(1 / float(fan_in)),
                            weights_regularizer=None,
                            activation_fn=None)
                    elif kind == "dropout":
                        net = slim.dropout(
                            net,
                            keep_prob=block["keep_prob"],
                            is_training=is_training)
                    else:
                        print("Invalid block")
                        # Same exception and status as the former exit(-1),
                        # without relying on the interactive `site` helper.
                        raise SystemExit(-1)
        return net
76 changes: 76 additions & 0 deletions examples/NAS-Envelopenet-trainingcontainer/cell_init.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""Initialization (Stem) cell"""
import tensorflow as tf
from cell import Cell

slim = tf.contrib.slim

def trunc_normal(stddev):
    """Build a truncated-normal initializer centred on zero."""
    initializer = tf.truncated_normal_initializer(mean=0.0, stddev=stddev)
    return initializer

class Init(Cell):
    """Initialization (Stem) cell: the first cell of a CNN."""

    def __init__(self, cellidx):
        self.cellidx = cellidx
        self.cellname = "Init"
        Cell.__init__(self)

    def cell(self, inputs, arch, is_training):
        """Create the cell by instantiating the cell blocks.

        Args:
            inputs: tensor fed into the first layer of the cell.
            arch: nested mapping ``arch[layer][branch] -> block``. Layers are
                visited in sorted key order; within a layer every branch is
                applied to the same input and the branch outputs are
                concatenated on the last axis. A block is a dict whose
                ``"block"`` entry is one of ``"conv2d"``, ``"max_pool"``,
                ``"lrn"``, ``"dropout"`` or ``"cutout"``.
            is_training: forwarded to ``slim.dropout``; ``"cutout"`` blocks
                pass their input through unchanged when it is false.

        Returns:
            The output of the last layer.

        Raises:
            SystemExit: with status -1, after printing "Invalid block", when
                a block type is not recognised.
        """
        nscope = "Cell_{}_{}".format(self.cellname, self.cellidx)
        with tf.variable_scope(nscope, 'initial_block', [inputs], reuse=None):
            with slim.arg_scope([slim.conv2d, slim.max_pool2d], stride=1, padding='SAME'):
                net = inputs
                for layer in sorted(arch):
                    cells = []
                    for branch in sorted(arch[layer]):
                        block = arch[layer][branch]
                        kind = block["block"]
                        if kind == "conv2d":
                            output_filters = int(block["outputs"])
                            kernel_size = block["kernel_size"]
                            # Stride is optional in the block and defaults to 1.
                            stride = block.get("stride", 1)
                            cell = slim.conv2d(
                                net,
                                output_filters,
                                kernel_size,
                                stride=stride,
                                padding='SAME')
                        elif kind == "max_pool":
                            kernel_size = block["kernel_size"]
                            cell = slim.max_pool2d(
                                net, kernel_size, padding='SAME', stride=2)
                        elif kind == "lrn":
                            cell = tf.nn.lrn(
                                net, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
                        elif kind == "dropout":
                            cell = slim.dropout(
                                net,
                                keep_prob=block["keep_prob"],
                                is_training=is_training)
                        elif kind == "cutout":
                            if not is_training:
                                cell = net
                            else:
                                cell = self._cutout(net, block["size"])
                        else:
                            print("Invalid block")
                            # Same exception and status as the former
                            # exit(-1), without relying on the interactive
                            # `site` helper.
                            raise SystemExit(-1)
                        cells.append(cell)
                    net = tf.concat(cells, axis=-1)

        return net

    @staticmethod
    def _cutout(net, cutout_size):
        """Zero a cutout_size x cutout_size square at a random position.

        One random position is drawn per evaluation and the resulting mask is
        tiled over the batch and channel axes, so every image and channel is
        masked at the same place. The position is drawn from [0, img_dim), so
        the square may be clipped at the bottom/right border.
        Axis 1 of ``net`` is used as the size of both spatial axes, and the
        batch, spatial and channel dimensions must be statically known.
        """
        img_dim = int(net.shape[1])
        batch_size = int(net.shape[0])
        channels = int(net.shape[3])
        mask = tf.ones([cutout_size, cutout_size], dtype=tf.int32)
        start = tf.random_uniform([2], minval=0, maxval=img_dim, dtype=tf.int32)
        # Pad the square of ones so that, after dropping the first
        # cutout_size rows/columns below, it starts at `start`.
        mask = tf.pad(mask, [[cutout_size + start[0], img_dim - start[0]],
                             [cutout_size + start[1], img_dim - start[1]]])
        mask = mask[cutout_size: cutout_size + img_dim,
                    cutout_size: cutout_size + img_dim]
        mask = tf.reshape(mask, [img_dim, img_dim, 1])
        mask = tf.expand_dims(mask, axis=0)
        mask = tf.tile(mask, [batch_size, 1, 1, channels])
        # Keep the input where the mask is 0 and write zeros where it is 1.
        return tf.where(tf.equal(mask, 0), x=net, y=tf.zeros_like(net))
115 changes: 115 additions & 0 deletions examples/NAS-Envelopenet-trainingcontainer/cell_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
""" Envelope Cell"""
import tensorflow as tf
from cell import Cell

slim = tf.contrib.slim

class CellEnvelope(Cell):
    """Definition of an envelope cell.

    The cell applies one convolution branch per entry of ``filters`` to the
    same input and concatenates the branch outputs on the channel axis.
    When ``log_stats`` is set, running statistics of every branch output are
    kept in variables and printed through ``tf.Print``.
    """

    def __init__(
            self,
            cellidx,
            channelwidth,
            net,
            filters,
            log_stats,
            outputs):
        """Record the cell configuration and create the statistics variables.

        Args:
            cellidx: index of the cell, used in the variable scope name.
            channelwidth: not used by this class; kept for the interface.
            net: input tensor; its static axis-0 and axis-1 sizes are stored
                as the batch size and the (square) image size.
            filters: iterable of branch names (strings).
            log_stats: when true, statistics variables are created here for
                every branch under ``Cell{cellidx}/{branch}``.
            outputs: number of output channels of every branch.
        """
        self.cellidx = cellidx
        self.log_stats = log_stats
        self.cellname = "Envelope"
        self.numbins = 100
        self.batchsize = int(net.shape[0])
        self.output_per_filter = outputs
        img_dims = int(net.shape[1])
        self.imagesize = [img_dims, img_dims]
        Cell.__init__(self)
        scope = 'Cell{}'.format(self.cellidx)
        if self.log_stats:
            with tf.variable_scope(scope, reuse=False):
                for branch in filters:
                    with tf.variable_scope(branch, reuse=False):
                        self.init_stats()

    def cell(self, inputs, channelwidth, is_training=True, filters=None):
        """Build the envelope cell on ``inputs``.

        Args:
            inputs: a tensor of size [batch_size, height, width, channels].
                By default use stride=1 and SAME padding.
            channelwidth: not used by this method; kept for the interface.
            is_training: forwarded to ``slim.dropout``.
            filters: iterable of branch names, visited in sorted order. The
                first character of a name is the (square) kernel size and a
                name ending in ``"sep"`` selects a separable convolution.
                NOTE(review): only single-digit kernel sizes can be encoded
                this way - confirm against the names the caller generates.

        Returns:
            The branch outputs concatenated on axis 3.
        """
        dropout_keep_prob = 0.8
        scope = 'Cell{}'.format(self.cellidx)
        nets = []
        with tf.variable_scope(scope):
            for branch in sorted(filters):
                with tf.variable_scope(branch) as branch_scope:
                    # branch[0] is a one-character string; convert it so the
                    # convolution receives an integer kernel size.
                    kernel = int(branch[0])
                    outchannels = self.output_per_filter
                    if branch.endswith("sep"):
                        net = slim.separable_conv2d(
                            inputs,
                            outchannels,
                            [kernel, kernel],
                            1,
                            normalizer_fn=slim.batch_norm)
                    else:
                        net = slim.conv2d(
                            inputs,
                            outchannels,
                            [kernel, kernel],
                            normalizer_fn=slim.batch_norm)
                    if self.log_stats:
                        # Pass the captured scope object, not the name:
                        # entering the name again from inside this scope
                        # would nest to Cell{i}/{branch}/{branch}, where
                        # init_stats created no variables.
                        msss = self.calc_stats(net, branch_scope)
                        net = tf.Print(
                            net,
                            [msss],
                            message="MeanSSS=:{}/{}:".format(scope, branch))
                    net = slim.dropout(
                        net,
                        keep_prob=dropout_keep_prob,
                        scope='dropout',
                        is_training=is_training)
                    nets.append(net)
            net = tf.concat(axis=3, values=nets)
        return net

    def init_stats(self):
        """Create the statistics variables in the current variable scope.

        Creates ``sumsquaredsamples`` and ``sumsamples`` with shape
        [batchsize, height, width, output_per_filter] and ``samplecount``
        with shape [1], all zero-initialised.
        """
        size = [
            self.batchsize,
            self.imagesize[0],
            self.imagesize[1],
            self.output_per_filter]
        # Same creation order as before; the returned handles are not
        # needed because calc_stats looks the variables up by name.
        for name in ("sumsquaredsamples", "sumsamples"):
            tf.contrib.framework.model_variable(
                name, size, initializer=tf.zeros_initializer)
        tf.contrib.framework.model_variable(
            "samplecount", [1], initializer=tf.zeros_initializer)

    def calc_stats(self, inputs, scope):
        """Update the running statistics with ``inputs`` and return their mean.

        Args:
            inputs: branch output tensor (N*H*W*C).
            scope: variable scope (name or ``VariableScope`` object) holding
                the variables created by ``init_stats``; entered with
                ``reuse=True``.

        Returns:
            A scalar tensor: the mean over all elements of
            ``sumsquaredsamples / samplecount`` after this update.
        """
        with tf.variable_scope(scope, reuse=True):
            size = [
                self.batchsize,
                self.imagesize[0],
                self.imagesize[1],
                self.output_per_filter]
            sumsquaredsamples = tf.get_variable("sumsquaredsamples", size)
            sumsamples = tf.get_variable("sumsamples", size)

            samplecount = tf.get_variable("samplecount", [1])
            tsamplecount = tf.add(samplecount, tf.to_float(tf.constant(1)))
            samplecount = samplecount.assign(tsamplecount)

            # Input is N*H*W*C. We need to calculate running variance over
            # time (i.e. over the N images in this batch and in all batches),
            # hence we reduce across the N dimension.
            # NOTE(review): the accumulators keep a leading batch axis while
            # the per-batch sums do not; the additions below rely on
            # broadcasting - confirm this is intended.
            sum_across_batch = tf.reduce_sum(inputs, axis=0)
            tsumsamples = tf.add(sumsamples, sum_across_batch)
            sumsamples = sumsamples.assign(tsumsamples)
            squared_inputs = tf.square(inputs)
            squared_sum_across_batch = tf.reduce_sum(squared_inputs, axis=0)
            tsumsquaredsamples = tf.add(
                sumsquaredsamples, squared_sum_across_batch)
            sumsquaredsamples = sumsquaredsamples.assign(tsumsquaredsamples)

            msss = (1 / samplecount) * (sumsquaredsamples)
            msss = tf.reduce_mean(msss)

        return msss

Loading