From 0d1ca70699c0755f382a560faa2efa93789fc0db Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Fri, 29 Nov 2019 15:27:01 +0800 Subject: [PATCH 01/57] checkpoint --- examples/nas/spos/blocks.py | 85 ++++++++++ examples/nas/spos/flops.py | 55 +++++++ examples/nas/spos/network.py | 107 +++++++++++++ examples/nas/spos/train.py | 301 +++++++++++++++++++++++++++++++++++ examples/nas/spos/utils.py | 96 +++++++++++ 5 files changed, 644 insertions(+) create mode 100644 examples/nas/spos/blocks.py create mode 100644 examples/nas/spos/flops.py create mode 100644 examples/nas/spos/network.py create mode 100644 examples/nas/spos/train.py create mode 100644 examples/nas/spos/utils.py diff --git a/examples/nas/spos/blocks.py b/examples/nas/spos/blocks.py new file mode 100644 index 0000000000..883ceb4d95 --- /dev/null +++ b/examples/nas/spos/blocks.py @@ -0,0 +1,85 @@ +import torch +import torch.nn as nn + + +class ShuffleNetBlock(nn.Module): + """ + When stride = 1, the block receives input with 2 * inp channels. Otherwise inp channels. + """ + + def __init__(self, inp, oup, mid_channels, ksize, stride, sequence="pdp"): + super().__init__() + assert stride in [1, 2] + assert ksize in [3, 5, 7] + assert oup > inp + + self.inp = inp + self.oup = oup + self.mid_channels = mid_channels + self.ksize = ksize + self.stride = stride + self.pad = ksize // 2 + self.oup_main = oup - inp + + self.branch_main = nn.Sequential(*self._decode_point_depth_conv(sequence)) + + if stride == 2: + self.branch_proj = nn.Sequential( + # dw + nn.Conv2d(inp, inp, ksize, stride, self.pad, groups=inp, bias=False), + nn.BatchNorm2d(inp, affine=False), + # pw-linear + nn.Conv2d(inp, inp, 1, 1, 0, bias=False), + nn.BatchNorm2d(inp, affine=False), + nn.ReLU(inplace=True) + ) + + def forward(self, x): + if self.stride == 2: + x_proj, x = self.branch_proj(x), x + else: + x_proj, x = self._channel_shuffle(x) + return torch.cat((x_proj, self.branch_main(x)), 1) + + def _decode_point_depth_conv(self, sequence): + result = [] + first_depth = first_point = True + pc = c = self.inp + for i, token in enumerate(sequence): + # compute output channels of this conv + if i + 1 == len(sequence): + assert token == "p", "Last conv must be point-wise conv." + c = self.oup_main + elif token == "p" and first_point: + c = self.mid_channels + if token == "d": + # depth-wise conv + assert pc == c, "Depth-wise conv must not change channels." 
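# Illustrative aside, not part of the patch: a worked expansion of the token decoding
# in _decode_point_depth_conv, assuming hypothetical values inp=16, oup=64,
# mid_channels=32, ksize=3, stride=2 and the default sequence "pdp". Each "p" becomes
# a 1x1 conv + BN + ReLU, each "d" a depthwise ksize x ksize conv + BN, and the last
# token must be "p" so the main branch ends with oup - inp (= oup_main) channels:
#
#   "p": Conv2d(16, 32, 1, 1, 0) + BN + ReLU        # pc=16 -> mid_channels=32
#   "d": Conv2d(32, 32, 3, 2, 1, groups=32) + BN     # depthwise; stride applied here only
#   "p": Conv2d(32, 48, 1, 1, 0) + BN + ReLU         # 48 = oup - inp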
+ result.append(nn.Conv2d(pc, c, self.ksize, self.stride if first_depth else 1, self.pad, + groups=c, bias=False)) + result.append(nn.BatchNorm2d(c, affine=False)) + first_depth = False + elif token == "p": + # point-wise conv + result.append(nn.Conv2d(pc, c, 1, 1, 0, bias=False)) + result.append(nn.BatchNorm2d(c, affine=False)) + result.append(nn.ReLU(inplace=True)) + first_point = False + else: + raise ValueError("Conv sequence must be d and p.") + pc = c + return result + + def _channel_shuffle(self, x): + bs, num_channels, height, width = x.data.size() + assert (num_channels % 4 == 0) + x = x.reshape(bs * num_channels // 2, 2, height * width) + x = x.permute(1, 0, 2) + x = x.reshape(2, -1, num_channels // 2, height, width) + return x[0], x[1] + + +class ShuffleXceptionBlock(ShuffleNetBlock): + + def __init__(self, inp, oup, mid_channels, stride): + super().__init__(inp, oup, mid_channels, 3, 1, "dpdpdp") diff --git a/examples/nas/spos/flops.py b/examples/nas/spos/flops.py new file mode 100644 index 0000000000..33f3d9ea8a --- /dev/null +++ b/examples/nas/spos/flops.py @@ -0,0 +1,55 @@ +op_flops_dict = pickle.load(open('./data/op_flops_dict.pkl', 'rb')) +backbone_info = [ # inp, oup, img_h, img_w, stride + (3, 16, 224, 224, 2), # conv1 + (16, 64, 112, 112, 2), + (64, 64, 56, 56, 1), + (64, 64, 56, 56, 1), + (64, 64, 56, 56, 1), + (64, 160, 56, 56, 2), # stride = 2 + (160, 160, 28, 28, 1), + (160, 160, 28, 28, 1), + (160, 160, 28, 28, 1), + (160, 320, 28, 28, 2), # stride = 2 + (320, 320, 14, 14, 1), + (320, 320, 14, 14, 1), + (320, 320, 14, 14, 1), + (320, 320, 14, 14, 1), + (320, 320, 14, 14, 1), + (320, 320, 14, 14, 1), + (320, 320, 14, 14, 1), + (320, 640, 14, 14, 2), # stride = 2 + (640, 640, 7, 7, 1), + (640, 640, 7, 7, 1), + (640, 640, 7, 7, 1), + (640, 1000, 7, 7, 1), # rest_operation +] +blocks_keys = [ + 'shufflenet_3x3', + 'shufflenet_5x5', + 'shufflenet_7x7', + 'xception_3x3', +] + + +def get_cand_flops(cand): + conv1_flops = op_flops_dict['conv1'][(3, 16, 224, 224, 2)] + rest_flops = op_flops_dict['rest_operation'][(640, 1000, 7, 7, 1)] + total_flops = conv1_flops + rest_flops + for i in range(len(cand)): + op_ids = cand[i] + inp, oup, img_h, img_w, stride = backbone_info[i + 1] + key = blocks_keys[op_ids] + '_stride_' + str(stride) + mid = int(oup // 2) + mid = int(mid) + total_flops += op_flops_dict[key][ + (inp, oup, mid, img_h, img_w, stride)] + return total_flops + + +def main(): + for i in range(4): + print(i, get_cand_flops((i,) * 20)) + + +if __name__ == '__main__': + main() diff --git a/examples/nas/spos/network.py b/examples/nas/spos/network.py new file mode 100644 index 0000000000..8d3ed65b1f --- /dev/null +++ b/examples/nas/spos/network.py @@ -0,0 +1,107 @@ +import torch +import torch.nn as nn + +from blocks import ShuffleNetBlock, ShuffleXceptionBlock + +from nni.nas.pytorch import mutables + + +class ShuffleNetV2OneShot(nn.Module): + + def __init__(self, input_size=224, first_conv_channels=16, last_conv_channels=1024, n_classes=1000): + super().__init__() + + assert input_size % 32 == 0 + + self.stage_blocks = [4, 4, 8, 4] + self.stage_channels = [64, 160, 320, 640] + + # building first layer + self.first_conv = nn.Sequential( + nn.Conv2d(3, first_conv_channels, 3, 2, 1, bias=False), + nn.BatchNorm2d(first_conv_channels, affine=False), + nn.ReLU(inplace=True), + ) + + p_channels = first_conv_channels + features = [] + for num_blocks, channels in zip(self.stage_blocks, self.stage_channels): + features.extend(self._make_blocks(num_blocks, p_channels, channels)) + 
p_channels = channels + self.features = nn.Sequential(*features) + + self.conv_last = nn.Sequential( + nn.Conv2d(p_channels, last_conv_channels, 1, 1, 0, bias=False), + nn.BatchNorm2d(last_conv_channels, affine=False), + nn.ReLU(inplace=True), + ) + self.globalpool = nn.AvgPool2d(7) + self.dropout = nn.Dropout(0.1) + self.classifier = nn.Linear(last_conv_channels, n_classes, bias=False) + + self._initialize_weights() + + def _make_blocks(self, blocks, in_channels, channels): + result = [] + for i in range(blocks): + stride = 2 if i == 0 else 1 + inp = in_channels if i == 0 else channels // 2 + oup = channels + + base_mid_channels = channels // 2 + mid_channels = int(base_mid_channels) # prepare for scale + result.append(mutables.LayerChoice([ + ShuffleNetBlock(inp, oup, mid_channels=mid_channels, ksize=3, stride=stride), + ShuffleNetBlock(inp, oup, mid_channels=mid_channels, ksize=5, stride=stride), + ShuffleNetBlock(inp, oup, mid_channels=mid_channels, ksize=7, stride=stride), + ShuffleXceptionBlock(inp, oup, mid_channels=mid_channels, stride=stride) + ])) + return result + + def forward(self, x): + bs = x.size(0) + x = self.first_conv(x) + x = self.features(x) + x = self.conv_last(x) + x = self.globalpool(x) + + x = self.dropout(x) + x = x.contiguous().view(bs, -1) + x = self.classifier(x) + return x + + def _initialize_weights(self): + for name, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + if 'first' in name: + nn.init.normal_(m.weight, 0, 0.01) + else: + nn.init.normal_(m.weight, 0, 1.0 / m.weight.shape[1]) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0.0001) + nn.init.constant_(m.running_mean, 0) + elif isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0.0001) + nn.init.constant_(m.running_mean, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + +if __name__ == "__main__": + # architecture = [0, 0, 3, 1, 1, 1, 0, 0, 2, 0, 2, 1, 1, 0, 2, 0, 2, 1, 3, 2] + # scale_list = [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6] + # scale_ids = [6, 5, 3, 5, 2, 6, 3, 4, 2, 5, 7, 5, 4, 6, 7, 4, 4, 5, 4, 3] + model = ShuffleNetV2_OneShot() + # print(model) + + test_data = torch.rand(5, 3, 224, 224) + test_outputs = model(test_data) + print(test_outputs.size()) diff --git a/examples/nas/spos/train.py b/examples/nas/spos/train.py new file mode 100644 index 0000000000..4748c34c1c --- /dev/null +++ b/examples/nas/spos/train.py @@ -0,0 +1,301 @@ +import argparse +import logging +import os +import sys +import time + +import PIL +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torchvision.datasets as datasets +import torchvision.transforms as transforms +from PIL import Image + +from flops import get_cand_flops +from network import ShuffleNetV2_OneShot +from utils import accuracy, AvgrageMeter, CrossEntropyLabelSmooth, save_checkpoint, get_lastest_model, get_parameters + + +class OpencvResize(object): + + def __init__(self, size=256): + self.size = size + + def __call__(self, img): + assert isinstance(img, PIL.Image.Image) + img = np.asarray(img) # (H,W,3) RGB + img = img[:, :, ::-1] # 2 BGR + img = np.ascontiguousarray(img) + H, W, _ = img.shape + target_size = (int(self.size / H * W + 0.5), self.size) if H < W else (self.size, int(self.size / W * H + 0.5)) + img = 
cv2.resize(img, target_size, interpolation=cv2.INTER_LINEAR) + img = img[:, :, ::-1] # 2 RGB + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + return img + + +class ToBGRTensor(object): + + def __call__(self, img): + assert isinstance(img, (np.ndarray, PIL.Image.Image)) + if isinstance(img, PIL.Image.Image): + img = np.asarray(img) + img = img[:, :, ::-1] # 2 BGR + img = np.transpose(img, [2, 0, 1]) # 2 (3, H, W) + img = np.ascontiguousarray(img) + img = torch.from_numpy(img).float() + return img + + +class DataIterator(object): + + def __init__(self, dataloader): + self.dataloader = dataloader + self.iterator = enumerate(self.dataloader) + + def next(self): + try: + _, data = next(self.iterator) + except Exception: + self.iterator = enumerate(self.dataloader) + _, data = next(self.iterator) + return data[0], data[1] + + +def get_args(): + parser = argparse.ArgumentParser("ShuffleNetV2_OneShot") + parser.add_argument('--eval', default=False, action='store_true') + parser.add_argument('--eval-resume', type=str, default='./snet_detnas.pkl', help='path for eval model') + parser.add_argument('--batch-size', type=int, default=1024, help='batch size') + parser.add_argument('--total-iters', type=int, default=150000, help='total iters') + parser.add_argument('--learning-rate', type=float, default=0.5, help='init learning rate') + parser.add_argument('--momentum', type=float, default=0.9, help='momentum') + parser.add_argument('--weight-decay', type=float, default=4e-5, help='weight decay') + parser.add_argument('--save', type=str, default='./models', help='path for saving trained models') + parser.add_argument('--label-smooth', type=float, default=0.1, help='label smoothing') + + parser.add_argument('--auto-continue', type=bool, default=True, help='report frequency') + parser.add_argument('--display-interval', type=int, default=20, help='report frequency') + parser.add_argument('--val-interval', type=int, default=10000, help='report frequency') + parser.add_argument('--save-interval', type=int, default=10000, help='report frequency') + + parser.add_argument('--train-dir', type=str, default='data/train', help='path to training dataset') + parser.add_argument('--val-dir', type=str, default='data/val', help='path to validation dataset') + + args = parser.parse_args() + return args + + +def main(): + args = get_args() + + # Log + log_format = '[%(asctime)s] %(message)s' + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%d %I:%M:%S') + t = time.time() + local_time = time.localtime(t) + if not os.path.exists('./log'): + os.mkdir('./log') + fh = logging.FileHandler( + os.path.join('log/train-{}{:02}{}'.format(local_time.tm_year % 2000, local_time.tm_mon, t))) + fh.setFormatter(logging.Formatter(log_format)) + logging.getLogger().addHandler(fh) + + use_gpu = False + if torch.cuda.is_available(): + use_gpu = True + + assert os.path.exists(args.train_dir) + train_dataset = datasets.ImageFolder( + args.train_dir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4), + transforms.RandomHorizontalFlip(0.5), + ToBGRTensor(), + ]) + ) + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=True, + num_workers=1, pin_memory=use_gpu) + train_dataprovider = DataIterator(train_loader) + + assert os.path.exists(args.val_dir) + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(args.val_dir, transforms.Compose([ + 
OpencvResize(256), + transforms.CenterCrop(224), + ToBGRTensor(), + ])), + batch_size=200, shuffle=False, + num_workers=1, pin_memory=use_gpu + ) + val_dataprovider = DataIterator(val_loader) + print('load data successfully') + + model = ShuffleNetV2_OneShot() + + optimizer = torch.optim.SGD(get_parameters(model), + lr=args.learning_rate, + momentum=args.momentum, + weight_decay=args.weight_decay) + criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1) + + if use_gpu: + model = nn.DataParallel(model) + loss_function = criterion_smooth.cuda() + device = torch.device("cuda") + else: + loss_function = criterion_smooth + device = torch.device("cpu") + + scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, + lambda step: ( + 1.0 - step / args.total_iters) if step <= args.total_iters else 0, + last_epoch=-1) + + model = model.to(device) + + all_iters = 0 + if args.auto_continue: + lastest_model, iters = get_lastest_model() + if lastest_model is not None: + all_iters = iters + checkpoint = torch.load(lastest_model, map_location=None if use_gpu else 'cpu') + model.load_state_dict(checkpoint['state_dict'], strict=True) + print('load from checkpoint') + for i in range(iters): + scheduler.step() + + args.optimizer = optimizer + args.loss_function = loss_function + args.scheduler = scheduler + args.train_dataprovider = train_dataprovider + args.val_dataprovider = val_dataprovider + + if args.eval: + if args.eval_resume is not None: + checkpoint = torch.load(args.eval_resume, map_location=None if use_gpu else 'cpu') + model.load_state_dict(checkpoint, strict=True) + validate(model, device, args, all_iters=all_iters) + exit(0) + + while all_iters < args.total_iters: + all_iters = train(model, device, args, val_interval=args.val_interval, bn_process=False, all_iters=all_iters) + # all_iters = train(model, device, args, val_interval=int(1280000/args.batch_size), bn_process=True, all_iters=all_iters) + # save_checkpoint({'state_dict': model.state_dict(),}, args.total_iters, tag='bnps-') + + +def adjust_bn_momentum(model, iters): + for m in model.modules(): + if isinstance(m, nn.BatchNorm2d): + m.momentum = 1 / iters + + +def train(model, device, args, *, val_interval, bn_process=False, all_iters=None): + optimizer = args.optimizer + loss_function = args.loss_function + scheduler = args.scheduler + train_dataprovider = args.train_dataprovider + + t1 = time.time() + Top1_err, Top5_err = 0.0, 0.0 + model.train() + for iters in range(1, val_interval + 1): + scheduler.step() + if bn_process: + adjust_bn_momentum(model, iters) + + all_iters += 1 + d_st = time.time() + data, target = train_dataprovider.next() + target = target.type(torch.LongTensor) + data, target = data.to(device), target.to(device) + data_time = time.time() - d_st + + get_random_cand = lambda: tuple(np.random.randint(4) for i in range(20)) + flops_l, flops_r, flops_step = 290, 360, 10 + bins = [[i, i + flops_step] for i in range(flops_l, flops_r, flops_step)] + + def get_uniform_sample_cand(*, timeout=500): + idx = np.random.randint(len(bins)) + l, r = bins[idx] + for i in range(timeout): + cand = get_random_cand() + if l * 1e6 <= get_cand_flops(cand) <= r * 1e6: + return cand + return get_random_cand() + + output = model(data, get_uniform_sample_cand()) + loss = loss_function(output, target) + optimizer.zero_grad() + loss.backward() + + for p in model.parameters(): + if p.grad is not None and p.grad.sum() == 0: + p.grad = None + + optimizer.step() + prec1, prec5 = accuracy(output, target, topk=(1, 5)) + + Top1_err += 1 - prec1.item() / 100 
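# Illustrative aside, not part of the patch: with flops_l, flops_r, flops_step = 290, 360, 10,
# `bins` holds the seven ranges [290, 300], [300, 310], ..., [350, 360] in MFLOPs, so
# get_uniform_sample_cand() picks one range uniformly and rejection-samples 20-tuples of
# op indices until get_cand_flops() lands inside it, falling back to a plain random
# candidate after `timeout` tries. Clearing p.grad for parameters whose gradient sums to
# zero (in practice the choice branches not on the sampled path) presumably makes SGD
# skip them entirely; an all-zero gradient left in place would still receive momentum
# and weight-decay updates.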
+ Top5_err += 1 - prec5.item() / 100 + + if all_iters % args.display_interval == 0: + printInfo = 'TRAIN Iter {}: lr = {:.6f},\tloss = {:.6f},\t'.format(all_iters, scheduler.get_lr()[0], + loss.item()) + \ + 'Top-1 err = {:.6f},\t'.format(Top1_err / args.display_interval) + \ + 'Top-5 err = {:.6f},\t'.format(Top5_err / args.display_interval) + \ + 'data_time = {:.6f},\ttrain_time = {:.6f}'.format(data_time, + (time.time() - t1) / args.display_interval) + logging.info(printInfo) + t1 = time.time() + Top1_err, Top5_err = 0.0, 0.0 + + if all_iters % args.save_interval == 0: + save_checkpoint({ + 'state_dict': model.state_dict(), + }, all_iters) + + return all_iters + + +def validate(model, device, args, *, all_iters=None): + objs = AvgrageMeter() + top1 = AvgrageMeter() + top5 = AvgrageMeter() + + loss_function = args.loss_function + val_dataprovider = args.val_dataprovider + + model.eval() + max_val_iters = 250 + t1 = time.time() + with torch.no_grad(): + for _ in range(1, max_val_iters + 1): + data, target = val_dataprovider.next() + target = target.type(torch.LongTensor) + data, target = data.to(device), target.to(device) + + output = model(data) + loss = loss_function(output, target) + + prec1, prec5 = accuracy(output, target, topk=(1, 5)) + n = data.size(0) + objs.update(loss.item(), n) + top1.update(prec1.item(), n) + top5.update(prec5.item(), n) + + logInfo = 'TEST Iter {}: loss = {:.6f},\t'.format(all_iters, objs.avg) + \ + 'Top-1 err = {:.6f},\t'.format(1 - top1.avg / 100) + \ + 'Top-5 err = {:.6f},\t'.format(1 - top5.avg / 100) + \ + 'val_time = {:.6f}'.format(time.time() - t1) + logging.info(logInfo) + + +if __name__ == "__main__": + main() diff --git a/examples/nas/spos/utils.py b/examples/nas/spos/utils.py new file mode 100644 index 0000000000..4bc1efccfd --- /dev/null +++ b/examples/nas/spos/utils.py @@ -0,0 +1,96 @@ +import os +import re + +import torch +import torch.nn as nn + + +class CrossEntropyLabelSmooth(nn.Module): + + def __init__(self, num_classes, epsilon): + super(CrossEntropyLabelSmooth, self).__init__() + self.num_classes = num_classes + self.epsilon = epsilon + self.logsoftmax = nn.LogSoftmax(dim=1) + + def forward(self, inputs, targets): + log_probs = self.logsoftmax(inputs) + targets = torch.zeros_like(log_probs).scatter_( + 1, targets.unsqueeze(1), 1) + targets = (1 - self.epsilon) * \ + targets + self.epsilon / self.num_classes + loss = (-targets * log_probs).mean(0).sum() + return loss + + +class AvgrageMeter(object): + + def __init__(self): + self.reset() + + def reset(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.val = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + +def accuracy(output, target, topk=(1,)): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +def save_checkpoint(state, iters, tag=''): + if not os.path.exists("./models"): + os.makedirs("./models") + filename = os.path.join( + "./models/{}checkpoint-{:06}.pth.tar".format(tag, iters)) + torch.save(state, filename) + latestfilename = os.path.join( + "./models/{}checkpoint-latest.pth.tar".format(tag)) + torch.save(state, latestfilename) + + +def get_lastest_model(): + if not os.path.exists('./models'): + os.mkdir('./models') + model_list = 
os.listdir('./models/') + if model_list == []: + return None, 0 + model_list.sort() + lastest_model = model_list[-1] + iters = re.findall(r'\d+', lastest_model) + return './models/' + lastest_model, int(iters[0]) + + +def get_parameters(model): + group_no_weight_decay = [] + group_weight_decay = [] + for pname, p in model.named_parameters(): + if pname.find('weight') >= 0 and len(p.size()) > 1: + # print('include ', pname, p.size()) + group_weight_decay.append(p) + else: + # print('not include ', pname, p.size()) + group_no_weight_decay.append(p) + assert len(list(model.parameters())) == len( + group_weight_decay) + len(group_no_weight_decay) + groups = [dict(params=group_weight_decay), dict( + params=group_no_weight_decay, weight_decay=0.)] + return groups From 5363aa4de338af349626489cd8a39596093ba734 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Fri, 29 Nov 2019 15:59:15 +0800 Subject: [PATCH 02/57] checkpoint --- examples/nas/spos/flops.py | 26 ++++++++++++++++++++++++++ examples/nas/spos/train.py | 4 ++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/examples/nas/spos/flops.py b/examples/nas/spos/flops.py index 33f3d9ea8a..a94e2d8932 100644 --- a/examples/nas/spos/flops.py +++ b/examples/nas/spos/flops.py @@ -1,3 +1,6 @@ +import json + + op_flops_dict = pickle.load(open('./data/op_flops_dict.pkl', 'rb')) backbone_info = [ # inp, oup, img_h, img_w, stride (3, 16, 224, 224, 2), # conv1 @@ -46,6 +49,29 @@ def get_cand_flops(cand): return total_flops +class Flops: + def __init__(self, backbones_mutable): + """ + Initialization of flops calculator. + + Parameters + ---------- + backbones_mutable: dict + Mapping mutable keys to backbones: `(inp, oup, img_h, img_w, stride)`. As this is needed for calculating + the exact flops. For example, + + { + "LayerChoice1": (16, 64, 112, 112, 2), + "LayerChoice2": (64, 64, 56, 56, 1), + } + """ + with open("/data/op_flops_dict.pkl", "rb") as fp: + self.op_flops_dict = json.load(fp) + self.backbones = backbones + + def __call__(self, decision_map): + pass + def main(): for i in range(4): print(i, get_cand_flops((i,) * 20)) diff --git a/examples/nas/spos/train.py b/examples/nas/spos/train.py index 4748c34c1c..b7e80dacdc 100644 --- a/examples/nas/spos/train.py +++ b/examples/nas/spos/train.py @@ -154,8 +154,8 @@ def main(): device = torch.device("cpu") scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, - lambda step: ( - 1.0 - step / args.total_iters) if step <= args.total_iters else 0, + lambda step: (1.0 - step / args.total_iters) + if step <= args.total_iters else 0, last_epoch=-1) model = model.to(device) From a4fc9cce537ce0067e0f3255b79ed26c521e6302 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Fri, 29 Nov 2019 21:47:52 +0800 Subject: [PATCH 03/57] checkpoint --- examples/nas/spos/blocks.py | 17 ++++---- examples/nas/spos/flops.py | 81 ------------------------------------ examples/nas/spos/network.py | 46 +++++++++++++++++--- examples/nas/spos/readme.md | 8 ++++ 4 files changed, 57 insertions(+), 95 deletions(-) delete mode 100644 examples/nas/spos/flops.py create mode 100644 examples/nas/spos/readme.md diff --git a/examples/nas/spos/blocks.py b/examples/nas/spos/blocks.py index 883ceb4d95..87bcc4789e 100644 --- a/examples/nas/spos/blocks.py +++ b/examples/nas/spos/blocks.py @@ -11,26 +11,27 @@ def __init__(self, inp, oup, mid_channels, ksize, stride, sequence="pdp"): super().__init__() assert stride in [1, 2] assert ksize in [3, 5, 7] - assert oup > inp - + self.channels = inp // 2 if stride == 1 else inp self.inp = inp 
self.oup = oup self.mid_channels = mid_channels self.ksize = ksize self.stride = stride self.pad = ksize // 2 - self.oup_main = oup - inp + self.oup_main = oup - self.channels + assert self.oup_main > 0 self.branch_main = nn.Sequential(*self._decode_point_depth_conv(sequence)) if stride == 2: self.branch_proj = nn.Sequential( # dw - nn.Conv2d(inp, inp, ksize, stride, self.pad, groups=inp, bias=False), - nn.BatchNorm2d(inp, affine=False), + nn.Conv2d(self.channels, self.channels, ksize, stride, self.pad, + groups=self.channels, bias=False), + nn.BatchNorm2d(self.channels, affine=False), # pw-linear - nn.Conv2d(inp, inp, 1, 1, 0, bias=False), - nn.BatchNorm2d(inp, affine=False), + nn.Conv2d(self.channels, self.channels, 1, 1, 0, bias=False), + nn.BatchNorm2d(self.channels, affine=False), nn.ReLU(inplace=True) ) @@ -44,7 +45,7 @@ def forward(self, x): def _decode_point_depth_conv(self, sequence): result = [] first_depth = first_point = True - pc = c = self.inp + pc = c = self.channels for i, token in enumerate(sequence): # compute output channels of this conv if i + 1 == len(sequence): diff --git a/examples/nas/spos/flops.py b/examples/nas/spos/flops.py deleted file mode 100644 index a94e2d8932..0000000000 --- a/examples/nas/spos/flops.py +++ /dev/null @@ -1,81 +0,0 @@ -import json - - -op_flops_dict = pickle.load(open('./data/op_flops_dict.pkl', 'rb')) -backbone_info = [ # inp, oup, img_h, img_w, stride - (3, 16, 224, 224, 2), # conv1 - (16, 64, 112, 112, 2), - (64, 64, 56, 56, 1), - (64, 64, 56, 56, 1), - (64, 64, 56, 56, 1), - (64, 160, 56, 56, 2), # stride = 2 - (160, 160, 28, 28, 1), - (160, 160, 28, 28, 1), - (160, 160, 28, 28, 1), - (160, 320, 28, 28, 2), # stride = 2 - (320, 320, 14, 14, 1), - (320, 320, 14, 14, 1), - (320, 320, 14, 14, 1), - (320, 320, 14, 14, 1), - (320, 320, 14, 14, 1), - (320, 320, 14, 14, 1), - (320, 320, 14, 14, 1), - (320, 640, 14, 14, 2), # stride = 2 - (640, 640, 7, 7, 1), - (640, 640, 7, 7, 1), - (640, 640, 7, 7, 1), - (640, 1000, 7, 7, 1), # rest_operation -] -blocks_keys = [ - 'shufflenet_3x3', - 'shufflenet_5x5', - 'shufflenet_7x7', - 'xception_3x3', -] - - -def get_cand_flops(cand): - conv1_flops = op_flops_dict['conv1'][(3, 16, 224, 224, 2)] - rest_flops = op_flops_dict['rest_operation'][(640, 1000, 7, 7, 1)] - total_flops = conv1_flops + rest_flops - for i in range(len(cand)): - op_ids = cand[i] - inp, oup, img_h, img_w, stride = backbone_info[i + 1] - key = blocks_keys[op_ids] + '_stride_' + str(stride) - mid = int(oup // 2) - mid = int(mid) - total_flops += op_flops_dict[key][ - (inp, oup, mid, img_h, img_w, stride)] - return total_flops - - -class Flops: - def __init__(self, backbones_mutable): - """ - Initialization of flops calculator. - - Parameters - ---------- - backbones_mutable: dict - Mapping mutable keys to backbones: `(inp, oup, img_h, img_w, stride)`. As this is needed for calculating - the exact flops. 
For example, - - { - "LayerChoice1": (16, 64, 112, 112, 2), - "LayerChoice2": (64, 64, 56, 56, 1), - } - """ - with open("/data/op_flops_dict.pkl", "rb") as fp: - self.op_flops_dict = json.load(fp) - self.backbones = backbones - - def __call__(self, decision_map): - pass - -def main(): - for i in range(4): - print(i, get_cand_flops((i,) * 20)) - - -if __name__ == '__main__': - main() diff --git a/examples/nas/spos/network.py b/examples/nas/spos/network.py index 8d3ed65b1f..e0baf192b4 100644 --- a/examples/nas/spos/network.py +++ b/examples/nas/spos/network.py @@ -1,3 +1,5 @@ +import pickle + import torch import torch.nn as nn @@ -7,14 +9,27 @@ class ShuffleNetV2OneShot(nn.Module): + block_keys = [ + 'shufflenet_3x3', + 'shufflenet_5x5', + 'shufflenet_7x7', + 'xception_3x3', + ] def __init__(self, input_size=224, first_conv_channels=16, last_conv_channels=1024, n_classes=1000): super().__init__() assert input_size % 32 == 0 + with open("./data/op_flops_dict.pkl", "rb") as fp: + self._op_flops_dict = pickle.load(fp) self.stage_blocks = [4, 4, 8, 4] self.stage_channels = [64, 160, 320, 640] + self._parsed_flops = dict() + self._input_size = input_size + self._feature_map_size = input_size + self._first_conv_channels = first_conv_channels + self._last_conv_channels = last_conv_channels # building first layer self.first_conv = nn.Sequential( @@ -35,7 +50,7 @@ def __init__(self, input_size=224, first_conv_channels=16, last_conv_channels=10 nn.BatchNorm2d(last_conv_channels, affine=False), nn.ReLU(inplace=True), ) - self.globalpool = nn.AvgPool2d(7) + self.globalpool = nn.AvgPool2d(self._feature_map_size) self.dropout = nn.Dropout(0.1) self.classifier = nn.Linear(last_conv_channels, n_classes, bias=False) @@ -45,17 +60,26 @@ def _make_blocks(self, blocks, in_channels, channels): result = [] for i in range(blocks): stride = 2 if i == 0 else 1 - inp = in_channels if i == 0 else channels // 2 + inp = in_channels if i == 0 else channels oup = channels base_mid_channels = channels // 2 mid_channels = int(base_mid_channels) # prepare for scale - result.append(mutables.LayerChoice([ + choice_block = mutables.LayerChoice([ ShuffleNetBlock(inp, oup, mid_channels=mid_channels, ksize=3, stride=stride), ShuffleNetBlock(inp, oup, mid_channels=mid_channels, ksize=5, stride=stride), ShuffleNetBlock(inp, oup, mid_channels=mid_channels, ksize=7, stride=stride), ShuffleXceptionBlock(inp, oup, mid_channels=mid_channels, stride=stride) - ])) + ]) + result.append(choice_block) + + # find the corresponding flops + flop_key = (inp, oup, mid_channels, self._feature_map_size, self._feature_map_size, stride) + self._parsed_flops[choice_block.key] = [ + self._op_flops_dict["{}_stride_{}".format(k, stride)][flop_key] for k in self.block_keys + ] + if stride == 2: + self._feature_map_size //= 2 return result def forward(self, x): @@ -70,6 +94,16 @@ def forward(self, x): x = self.classifier(x) return x + def get_candidate_flops(self, candidate): + conv1_flops = self._op_flops_dict['conv1'][(3, self._first_conv_channels, + self._input_size, self._input_size, 2)] + rest_flops = self._op_flops_dict['rest_operation'][(self.stage_channels[-1], self._last_conv_channels, + self._feature_map_size, self._feature_map_size, 1)] + total_flops = conv1_flops + rest_flops + for k, m in candidate.items(): + total_flops += self._parsed_flops[k][torch.max(m)[1]] + return total_flops + def _initialize_weights(self): for name, m in self.named_modules(): if isinstance(m, nn.Conv2d): @@ -95,12 +129,12 @@ def _initialize_weights(self): if m.bias is 
not None: nn.init.constant_(m.bias, 0) + if __name__ == "__main__": # architecture = [0, 0, 3, 1, 1, 1, 0, 0, 2, 0, 2, 1, 1, 0, 2, 0, 2, 1, 3, 2] # scale_list = [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6] # scale_ids = [6, 5, 3, 5, 2, 6, 3, 4, 2, 5, 7, 5, 4, 6, 7, 4, 4, 5, 4, 3] - model = ShuffleNetV2_OneShot() - # print(model) + model = ShuffleNetV2OneShot() test_data = torch.rand(5, 3, 224, 224) test_outputs = model(test_data) diff --git a/examples/nas/spos/readme.md b/examples/nas/spos/readme.md new file mode 100644 index 0000000000..1b7c4a8924 --- /dev/null +++ b/examples/nas/spos/readme.md @@ -0,0 +1,8 @@ +# Single Path One-Shot + +Single Path One-Shot by Megvii Research. + +## Preparation + +Need to download the flops lookup table from [here](https://1drv.ms/u/s!Am_mmG2-KsrnajesvSdfsq_cN48?e=aHVppN). +Put `op_flops_dict.pkl` under `data` directory. From 9bc24b0d936253aaec43635cc2f6eb17fbaae8d9 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sun, 1 Dec 2019 15:56:41 +0000 Subject: [PATCH 04/57] checkpoint --- examples/nas/spos/blocks.py | 2 +- examples/nas/spos/network.py | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/examples/nas/spos/blocks.py b/examples/nas/spos/blocks.py index 87bcc4789e..acc9f996ab 100644 --- a/examples/nas/spos/blocks.py +++ b/examples/nas/spos/blocks.py @@ -83,4 +83,4 @@ def _channel_shuffle(self, x): class ShuffleXceptionBlock(ShuffleNetBlock): def __init__(self, inp, oup, mid_channels, stride): - super().__init__(inp, oup, mid_channels, 3, 1, "dpdpdp") + super().__init__(inp, oup, mid_channels, 3, stride, "dpdpdp") diff --git a/examples/nas/spos/network.py b/examples/nas/spos/network.py index e0baf192b4..11ffd54784 100644 --- a/examples/nas/spos/network.py +++ b/examples/nas/spos/network.py @@ -1,4 +1,5 @@ import pickle +import re import torch import torch.nn as nn @@ -6,6 +7,7 @@ from blocks import ShuffleNetBlock, ShuffleXceptionBlock from nni.nas.pytorch import mutables +from nni.nas.pytorch.random import RandomMutator class ShuffleNetV2OneShot(nn.Module): @@ -37,6 +39,7 @@ def __init__(self, input_size=224, first_conv_channels=16, last_conv_channels=10 nn.BatchNorm2d(first_conv_channels, affine=False), nn.ReLU(inplace=True), ) + self._feature_map_size //= 2 p_channels = first_conv_channels features = [] @@ -52,7 +55,9 @@ def __init__(self, input_size=224, first_conv_channels=16, last_conv_channels=10 ) self.globalpool = nn.AvgPool2d(self._feature_map_size) self.dropout = nn.Dropout(0.1) - self.classifier = nn.Linear(last_conv_channels, n_classes, bias=False) + self.classifier = nn.Sequential( + nn.Linear(last_conv_channels, n_classes, bias=False), + ) self._initialize_weights() @@ -75,6 +80,7 @@ def _make_blocks(self, blocks, in_channels, channels): # find the corresponding flops flop_key = (inp, oup, mid_channels, self._feature_map_size, self._feature_map_size, stride) + print(flop_key) self._parsed_flops[choice_block.key] = [ self._op_flops_dict["{}_stride_{}".format(k, stride)][flop_key] for k in self.block_keys ] @@ -95,6 +101,8 @@ def forward(self, x): return x def get_candidate_flops(self, candidate): + print((3, self._first_conv_channels, self._input_size, self._input_size, 2)) + print((self.stage_channels[-1], self._last_conv_channels, self._feature_map_size, self._feature_map_size, 1)) conv1_flops = self._op_flops_dict['conv1'][(3, self._first_conv_channels, self._input_size, self._input_size, 2)] rest_flops = self._op_flops_dict['rest_operation'][(self.stage_channels[-1], self._last_conv_channels, @@ 
-130,12 +138,27 @@ def _initialize_weights(self): nn.init.constant_(m.bias, 0) +def load_and_parse_state_dict(): + checkpoint = torch.load("./data/checkpoint-150000.pth.tar") + result = dict() + for k, v in checkpoint["state_dict"].items(): + if k.startswith("module."): + k = k[len("module."):] + k = re.sub(r"^(features.\d+).(\d+)", "\\1.choices.\\2", k) + result[k] = v + return result + + if __name__ == "__main__": # architecture = [0, 0, 3, 1, 1, 1, 0, 0, 2, 0, 2, 1, 1, 0, 2, 0, 2, 1, 3, 2] # scale_list = [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6] # scale_ids = [6, 5, 3, 5, 2, 6, 3, 4, 2, 5, 7, 5, 4, 6, 7, 4, 4, 5, 4, 3] model = ShuffleNetV2OneShot() + mutator = RandomMutator(model) + model_state_dict = load_and_parse_state_dict() + model.load_state_dict(model_state_dict) test_data = torch.rand(5, 3, 224, 224) + mutator.reset() test_outputs = model(test_data) print(test_outputs.size()) From 7b6ff0dabce0eb40e67e2d9b12a1a6db3088fc5e Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sun, 1 Dec 2019 16:20:30 +0000 Subject: [PATCH 05/57] checkpoint --- examples/nas/spos/network.py | 17 +++++----- .../pynni/nni/nas/pytorch/spos/__init__.py | 1 + src/sdk/pynni/nni/nas/pytorch/spos/mutator.py | 32 +++++++++++++++++++ 3 files changed, 42 insertions(+), 8 deletions(-) create mode 100644 src/sdk/pynni/nni/nas/pytorch/spos/__init__.py create mode 100644 src/sdk/pynni/nni/nas/pytorch/spos/mutator.py diff --git a/examples/nas/spos/network.py b/examples/nas/spos/network.py index 11ffd54784..0d5dedee96 100644 --- a/examples/nas/spos/network.py +++ b/examples/nas/spos/network.py @@ -7,7 +7,7 @@ from blocks import ShuffleNetBlock, ShuffleXceptionBlock from nni.nas.pytorch import mutables -from nni.nas.pytorch.random import RandomMutator +from nni.nas.pytorch.spos import SPOSSupernetTrainingMutator class ShuffleNetV2OneShot(nn.Module): @@ -32,6 +32,7 @@ def __init__(self, input_size=224, first_conv_channels=16, last_conv_channels=10 self._feature_map_size = input_size self._first_conv_channels = first_conv_channels self._last_conv_channels = last_conv_channels + self._n_classes = n_classes # building first layer self.first_conv = nn.Sequential( @@ -80,7 +81,6 @@ def _make_blocks(self, blocks, in_channels, channels): # find the corresponding flops flop_key = (inp, oup, mid_channels, self._feature_map_size, self._feature_map_size, stride) - print(flop_key) self._parsed_flops[choice_block.key] = [ self._op_flops_dict["{}_stride_{}".format(k, stride)][flop_key] for k in self.block_keys ] @@ -101,15 +101,15 @@ def forward(self, x): return x def get_candidate_flops(self, candidate): - print((3, self._first_conv_channels, self._input_size, self._input_size, 2)) - print((self.stage_channels[-1], self._last_conv_channels, self._feature_map_size, self._feature_map_size, 1)) - conv1_flops = self._op_flops_dict['conv1'][(3, self._first_conv_channels, + conv1_flops = self._op_flops_dict["conv1"][(3, self._first_conv_channels, self._input_size, self._input_size, 2)] - rest_flops = self._op_flops_dict['rest_operation'][(self.stage_channels[-1], self._last_conv_channels, + # Should use `last_conv_channels` here, but megvii insists that it's `n_classes`. Keeping it. 
+ # https://github.com/megvii-model/SinglePathOneShot/blob/36eed6cf083497ffa9cfe7b8da25bb0b6ba5a452/src/Supernet/flops.py#L313 + rest_flops = self._op_flops_dict["rest_operation"][(self.stage_channels[-1], self._n_classes, self._feature_map_size, self._feature_map_size, 1)] total_flops = conv1_flops + rest_flops for k, m in candidate.items(): - total_flops += self._parsed_flops[k][torch.max(m)[1]] + total_flops += self._parsed_flops[k][torch.max(m, 0)[1]] return total_flops def _initialize_weights(self): @@ -154,7 +154,8 @@ def load_and_parse_state_dict(): # scale_list = [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6] # scale_ids = [6, 5, 3, 5, 2, 6, 3, 4, 2, 5, 7, 5, 4, 6, 7, 4, 4, 5, 4, 3] model = ShuffleNetV2OneShot() - mutator = RandomMutator(model) + mutator = SPOSSupernetTrainingMutator(model, flops_func=model.get_candidate_flops, + flops_lb=290E6, flops_ub=360E6) model_state_dict = load_and_parse_state_dict() model.load_state_dict(model_state_dict) diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/__init__.py b/src/sdk/pynni/nni/nas/pytorch/spos/__init__.py new file mode 100644 index 0000000000..16f83d4b14 --- /dev/null +++ b/src/sdk/pynni/nni/nas/pytorch/spos/__init__.py @@ -0,0 +1 @@ +from .mutator import SPOSSupernetTrainingMutator diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py b/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py new file mode 100644 index 0000000000..ceede542dc --- /dev/null +++ b/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py @@ -0,0 +1,32 @@ +import logging + +import numpy as np +import torch +import torch.nn.functional as F + +from nni.nas.pytorch.random import RandomMutator + +_logger = logging.getLogger(__name__) + + +class SPOSSupernetTrainingMutator(RandomMutator): + def __init__(self, model, flops_func, flops_lb, flops_ub, + flops_bin_num=7, flops_sample_timeout=500): + super().__init__(model) + self._flops_func = flops_func + self._flops_bin_num = flops_bin_num + self._flops_bins = [flops_lb + (flops_ub - flops_lb) / flops_bin_num * i for i in range(flops_bin_num)] + self._flops_sample_timeout = flops_sample_timeout + + def sample_search(self): + for _ in range(self._flops_sample_timeout): + idx = np.random.randint(self._flops_bin_num) + cand = super().sample_search() + if self._flops_bins[idx] <= self._flops_func(cand) <= self._flops_bins[idx + 1]: + _logger.debug("Sampled candidate flops %f.", cand) + return cand + _logger.warning("Failed to sample a flops-valid candidate within %d tries.", self._flops_sample_timeout) + return super().sample_search() + + def sample_final(self): + return self.sample_search() From a412ac91ce798368974d7be445a208df8edc393d Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Mon, 2 Dec 2019 12:43:53 +0800 Subject: [PATCH 06/57] checkpoint --- examples/nas/spos/train.py | 301 ------------------ examples/nas/spos/train_supernet.py | 45 +++ examples/nas/spos/utils.py | 98 ++---- .../pynni/nni/nas/pytorch/spos/__init__.py | 4 + src/sdk/pynni/nni/nas/pytorch/spos/mutator.py | 30 +- src/sdk/pynni/nni/nas/pytorch/spos/trainer.py | 61 ++++ 6 files changed, 156 insertions(+), 383 deletions(-) delete mode 100644 examples/nas/spos/train.py create mode 100644 examples/nas/spos/train_supernet.py create mode 100644 src/sdk/pynni/nni/nas/pytorch/spos/trainer.py diff --git a/examples/nas/spos/train.py b/examples/nas/spos/train.py deleted file mode 100644 index b7e80dacdc..0000000000 --- a/examples/nas/spos/train.py +++ /dev/null @@ -1,301 +0,0 @@ -import argparse -import logging -import os -import sys -import time - -import PIL -import cv2 
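An illustrative sketch, not part of the patch: how the mutator defined above is meant to
be driven, pieced together from the `__main__` block of network.py earlier in this series
(checkpoint loading omitted; 290E6/360E6 are the 290M/360M FLOPs bounds used throughout
this example).

import torch
from network import ShuffleNetV2OneShot
from nni.nas.pytorch.spos import SPOSSupernetTrainingMutator

model = ShuffleNetV2OneShot()        # needs ./data/op_flops_dict.pkl, see readme.md
mutator = SPOSSupernetTrainingMutator(model, flops_func=model.get_candidate_flops,
                                      flops_lb=290E6, flops_ub=360E6)
mutator.reset()                      # sample one single path within the FLOPs range
out = model(torch.rand(1, 3, 224, 224))
print(out.size())                    # torch.Size([1, 1000])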
-import numpy as np -import torch -import torch.nn as nn -import torchvision.datasets as datasets -import torchvision.transforms as transforms -from PIL import Image - -from flops import get_cand_flops -from network import ShuffleNetV2_OneShot -from utils import accuracy, AvgrageMeter, CrossEntropyLabelSmooth, save_checkpoint, get_lastest_model, get_parameters - - -class OpencvResize(object): - - def __init__(self, size=256): - self.size = size - - def __call__(self, img): - assert isinstance(img, PIL.Image.Image) - img = np.asarray(img) # (H,W,3) RGB - img = img[:, :, ::-1] # 2 BGR - img = np.ascontiguousarray(img) - H, W, _ = img.shape - target_size = (int(self.size / H * W + 0.5), self.size) if H < W else (self.size, int(self.size / W * H + 0.5)) - img = cv2.resize(img, target_size, interpolation=cv2.INTER_LINEAR) - img = img[:, :, ::-1] # 2 RGB - img = np.ascontiguousarray(img) - img = Image.fromarray(img) - return img - - -class ToBGRTensor(object): - - def __call__(self, img): - assert isinstance(img, (np.ndarray, PIL.Image.Image)) - if isinstance(img, PIL.Image.Image): - img = np.asarray(img) - img = img[:, :, ::-1] # 2 BGR - img = np.transpose(img, [2, 0, 1]) # 2 (3, H, W) - img = np.ascontiguousarray(img) - img = torch.from_numpy(img).float() - return img - - -class DataIterator(object): - - def __init__(self, dataloader): - self.dataloader = dataloader - self.iterator = enumerate(self.dataloader) - - def next(self): - try: - _, data = next(self.iterator) - except Exception: - self.iterator = enumerate(self.dataloader) - _, data = next(self.iterator) - return data[0], data[1] - - -def get_args(): - parser = argparse.ArgumentParser("ShuffleNetV2_OneShot") - parser.add_argument('--eval', default=False, action='store_true') - parser.add_argument('--eval-resume', type=str, default='./snet_detnas.pkl', help='path for eval model') - parser.add_argument('--batch-size', type=int, default=1024, help='batch size') - parser.add_argument('--total-iters', type=int, default=150000, help='total iters') - parser.add_argument('--learning-rate', type=float, default=0.5, help='init learning rate') - parser.add_argument('--momentum', type=float, default=0.9, help='momentum') - parser.add_argument('--weight-decay', type=float, default=4e-5, help='weight decay') - parser.add_argument('--save', type=str, default='./models', help='path for saving trained models') - parser.add_argument('--label-smooth', type=float, default=0.1, help='label smoothing') - - parser.add_argument('--auto-continue', type=bool, default=True, help='report frequency') - parser.add_argument('--display-interval', type=int, default=20, help='report frequency') - parser.add_argument('--val-interval', type=int, default=10000, help='report frequency') - parser.add_argument('--save-interval', type=int, default=10000, help='report frequency') - - parser.add_argument('--train-dir', type=str, default='data/train', help='path to training dataset') - parser.add_argument('--val-dir', type=str, default='data/val', help='path to validation dataset') - - args = parser.parse_args() - return args - - -def main(): - args = get_args() - - # Log - log_format = '[%(asctime)s] %(message)s' - logging.basicConfig(stream=sys.stdout, level=logging.INFO, - format=log_format, datefmt='%d %I:%M:%S') - t = time.time() - local_time = time.localtime(t) - if not os.path.exists('./log'): - os.mkdir('./log') - fh = logging.FileHandler( - os.path.join('log/train-{}{:02}{}'.format(local_time.tm_year % 2000, local_time.tm_mon, t))) - 
fh.setFormatter(logging.Formatter(log_format)) - logging.getLogger().addHandler(fh) - - use_gpu = False - if torch.cuda.is_available(): - use_gpu = True - - assert os.path.exists(args.train_dir) - train_dataset = datasets.ImageFolder( - args.train_dir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4), - transforms.RandomHorizontalFlip(0.5), - ToBGRTensor(), - ]) - ) - train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, shuffle=True, - num_workers=1, pin_memory=use_gpu) - train_dataprovider = DataIterator(train_loader) - - assert os.path.exists(args.val_dir) - val_loader = torch.utils.data.DataLoader( - datasets.ImageFolder(args.val_dir, transforms.Compose([ - OpencvResize(256), - transforms.CenterCrop(224), - ToBGRTensor(), - ])), - batch_size=200, shuffle=False, - num_workers=1, pin_memory=use_gpu - ) - val_dataprovider = DataIterator(val_loader) - print('load data successfully') - - model = ShuffleNetV2_OneShot() - - optimizer = torch.optim.SGD(get_parameters(model), - lr=args.learning_rate, - momentum=args.momentum, - weight_decay=args.weight_decay) - criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1) - - if use_gpu: - model = nn.DataParallel(model) - loss_function = criterion_smooth.cuda() - device = torch.device("cuda") - else: - loss_function = criterion_smooth - device = torch.device("cpu") - - scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, - lambda step: (1.0 - step / args.total_iters) - if step <= args.total_iters else 0, - last_epoch=-1) - - model = model.to(device) - - all_iters = 0 - if args.auto_continue: - lastest_model, iters = get_lastest_model() - if lastest_model is not None: - all_iters = iters - checkpoint = torch.load(lastest_model, map_location=None if use_gpu else 'cpu') - model.load_state_dict(checkpoint['state_dict'], strict=True) - print('load from checkpoint') - for i in range(iters): - scheduler.step() - - args.optimizer = optimizer - args.loss_function = loss_function - args.scheduler = scheduler - args.train_dataprovider = train_dataprovider - args.val_dataprovider = val_dataprovider - - if args.eval: - if args.eval_resume is not None: - checkpoint = torch.load(args.eval_resume, map_location=None if use_gpu else 'cpu') - model.load_state_dict(checkpoint, strict=True) - validate(model, device, args, all_iters=all_iters) - exit(0) - - while all_iters < args.total_iters: - all_iters = train(model, device, args, val_interval=args.val_interval, bn_process=False, all_iters=all_iters) - # all_iters = train(model, device, args, val_interval=int(1280000/args.batch_size), bn_process=True, all_iters=all_iters) - # save_checkpoint({'state_dict': model.state_dict(),}, args.total_iters, tag='bnps-') - - -def adjust_bn_momentum(model, iters): - for m in model.modules(): - if isinstance(m, nn.BatchNorm2d): - m.momentum = 1 / iters - - -def train(model, device, args, *, val_interval, bn_process=False, all_iters=None): - optimizer = args.optimizer - loss_function = args.loss_function - scheduler = args.scheduler - train_dataprovider = args.train_dataprovider - - t1 = time.time() - Top1_err, Top5_err = 0.0, 0.0 - model.train() - for iters in range(1, val_interval + 1): - scheduler.step() - if bn_process: - adjust_bn_momentum(model, iters) - - all_iters += 1 - d_st = time.time() - data, target = train_dataprovider.next() - target = target.type(torch.LongTensor) - data, target = data.to(device), target.to(device) - data_time = time.time() - d_st - - 
get_random_cand = lambda: tuple(np.random.randint(4) for i in range(20)) - flops_l, flops_r, flops_step = 290, 360, 10 - bins = [[i, i + flops_step] for i in range(flops_l, flops_r, flops_step)] - - def get_uniform_sample_cand(*, timeout=500): - idx = np.random.randint(len(bins)) - l, r = bins[idx] - for i in range(timeout): - cand = get_random_cand() - if l * 1e6 <= get_cand_flops(cand) <= r * 1e6: - return cand - return get_random_cand() - - output = model(data, get_uniform_sample_cand()) - loss = loss_function(output, target) - optimizer.zero_grad() - loss.backward() - - for p in model.parameters(): - if p.grad is not None and p.grad.sum() == 0: - p.grad = None - - optimizer.step() - prec1, prec5 = accuracy(output, target, topk=(1, 5)) - - Top1_err += 1 - prec1.item() / 100 - Top5_err += 1 - prec5.item() / 100 - - if all_iters % args.display_interval == 0: - printInfo = 'TRAIN Iter {}: lr = {:.6f},\tloss = {:.6f},\t'.format(all_iters, scheduler.get_lr()[0], - loss.item()) + \ - 'Top-1 err = {:.6f},\t'.format(Top1_err / args.display_interval) + \ - 'Top-5 err = {:.6f},\t'.format(Top5_err / args.display_interval) + \ - 'data_time = {:.6f},\ttrain_time = {:.6f}'.format(data_time, - (time.time() - t1) / args.display_interval) - logging.info(printInfo) - t1 = time.time() - Top1_err, Top5_err = 0.0, 0.0 - - if all_iters % args.save_interval == 0: - save_checkpoint({ - 'state_dict': model.state_dict(), - }, all_iters) - - return all_iters - - -def validate(model, device, args, *, all_iters=None): - objs = AvgrageMeter() - top1 = AvgrageMeter() - top5 = AvgrageMeter() - - loss_function = args.loss_function - val_dataprovider = args.val_dataprovider - - model.eval() - max_val_iters = 250 - t1 = time.time() - with torch.no_grad(): - for _ in range(1, max_val_iters + 1): - data, target = val_dataprovider.next() - target = target.type(torch.LongTensor) - data, target = data.to(device), target.to(device) - - output = model(data) - loss = loss_function(output, target) - - prec1, prec5 = accuracy(output, target, topk=(1, 5)) - n = data.size(0) - objs.update(loss.item(), n) - top1.update(prec1.item(), n) - top5.update(prec5.item(), n) - - logInfo = 'TEST Iter {}: loss = {:.6f},\t'.format(all_iters, objs.avg) + \ - 'Top-1 err = {:.6f},\t'.format(1 - top1.avg / 100) + \ - 'Top-5 err = {:.6f},\t'.format(1 - top5.avg / 100) + \ - 'val_time = {:.6f}'.format(time.time() - t1) - logging.info(logInfo) - - -if __name__ == "__main__": - main() diff --git a/examples/nas/spos/train_supernet.py b/examples/nas/spos/train_supernet.py new file mode 100644 index 0000000000..e4f3757f2b --- /dev/null +++ b/examples/nas/spos/train_supernet.py @@ -0,0 +1,45 @@ +import argparse + +import torch +import torch.nn as nn +from nni.nas.pytorch.callbacks import Callback, LRSchedulerCallback + +from network import ShuffleNetV2OneShot +from src.sdk.pynni.nni.nas.pytorch.spos import SPOSSupernetTrainingMutator, SPOSSupernetTrainer +from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy + + +class AdjustBNMomentum(Callback): + def on_epoch_begin(self, epoch): + for m in self.model.modules(): + if isinstance(m, nn.BatchNorm2d): + m.momentum = 1 / (epoch + 1) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("SPOS Supernet Training") + parser.add_argument("--imagenet-dir", type=int, default="./data/imagenet") + parser.add_argument("--batch-size", type=int, default=1024) + parser.add_argument("--epochs", type=int, default=15) + parser.add_argument("--learning-rate", type=float, default=0.5) + 
parser.add_argument("--momentum", type=float, default=0.9) + parser.add_argument("--weight-decay", type=float, default=4E-5) + parser.add_argument("--label-smooth", type=float, default=0.1) + + args = parser.parse_args() + dataset_train, dataset_valid = get_imagenet(args.imagenet_dir) + model = ShuffleNetV2OneShot() + model = nn.DataParallel(model) + mutator = SPOSSupernetTrainingMutator(model, flops_func=model.module.get_candidate_flops, + flops_lb=290E6, flops_ub=360E6) + criterion = CrossEntropyLabelSmooth(1000, 0.1) + optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, + momentum=args.momentum, weight_decay=args.weight_decay) + scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, + lambda step: (1.0 - step / args.epochs) + if step <= args.total_iters else 0, + last_epoch=-1) + trainer = SPOSSupernetTrainer(model, criterion, accuracy, optimizer, + args.num_epochs, dataset_train, dataset_valid, + mutator=mutator, batch_size=args.batch_size, + callbacks=[LRSchedulerCallback(scheduler), AdjustBNMomentum()]) diff --git a/examples/nas/spos/utils.py b/examples/nas/spos/utils.py index 4bc1efccfd..8f42460f28 100644 --- a/examples/nas/spos/utils.py +++ b/examples/nas/spos/utils.py @@ -1,9 +1,26 @@ -import os -import re - import torch import torch.nn as nn +from torchvision import transforms +from torchvision.datasets import ImageNet + + +def get_imagenet(imagenet_root): + train_transform = transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4), + transforms.RandomHorizontalFlip(0.5), + transforms.ToTensor(), + ]) + valid_transform = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + ]) + train_dataset = ImageNet(imagenet_root, split="train", transform=train_transform) + valid_dataset = ImageNet(imagenet_root, split="val", transform=valid_transform) + return train_dataset, valid_dataset + class CrossEntropyLabelSmooth(nn.Module): @@ -15,82 +32,27 @@ def __init__(self, num_classes, epsilon): def forward(self, inputs, targets): log_probs = self.logsoftmax(inputs) - targets = torch.zeros_like(log_probs).scatter_( - 1, targets.unsqueeze(1), 1) - targets = (1 - self.epsilon) * \ - targets + self.epsilon / self.num_classes + targets = torch.zeros_like(log_probs).scatter_(1, targets.unsqueeze(1), 1) + targets = (1 - self.epsilon) * targets + self.epsilon / self.num_classes loss = (-targets * log_probs).mean(0).sum() return loss -class AvgrageMeter(object): - - def __init__(self): - self.reset() - - def reset(self): - self.avg = 0 - self.sum = 0 - self.cnt = 0 - self.val = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.cnt += n - self.avg = self.sum / self.cnt - - -def accuracy(output, target, topk=(1,)): +def accuracy(output, target, topk=(1, 5)): + """ Computes the precision@k for the specified values of k """ maxk = max(topk) batch_size = target.size(0) _, pred = output.topk(maxk, 1, True, True) pred = pred.t() + # one-hot case + if target.ndimension() > 1: + target = target.max(1)[1] + correct = pred.eq(target.view(1, -1).expand_as(pred)) - res = [] + res = dict() for k in topk: correct_k = correct[:k].view(-1).float().sum(0) - res.append(correct_k.mul_(100.0 / batch_size)) + res["acc{}".format(k)] = correct_k.mul_(1.0 / batch_size).item() return res - - -def save_checkpoint(state, iters, tag=''): - if not os.path.exists("./models"): - os.makedirs("./models") - filename = os.path.join( - 
"./models/{}checkpoint-{:06}.pth.tar".format(tag, iters)) - torch.save(state, filename) - latestfilename = os.path.join( - "./models/{}checkpoint-latest.pth.tar".format(tag)) - torch.save(state, latestfilename) - - -def get_lastest_model(): - if not os.path.exists('./models'): - os.mkdir('./models') - model_list = os.listdir('./models/') - if model_list == []: - return None, 0 - model_list.sort() - lastest_model = model_list[-1] - iters = re.findall(r'\d+', lastest_model) - return './models/' + lastest_model, int(iters[0]) - - -def get_parameters(model): - group_no_weight_decay = [] - group_weight_decay = [] - for pname, p in model.named_parameters(): - if pname.find('weight') >= 0 and len(p.size()) > 1: - # print('include ', pname, p.size()) - group_weight_decay.append(p) - else: - # print('not include ', pname, p.size()) - group_no_weight_decay.append(p) - assert len(list(model.parameters())) == len( - group_weight_decay) + len(group_no_weight_decay) - groups = [dict(params=group_weight_decay), dict( - params=group_no_weight_decay, weight_decay=0.)] - return groups diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/__init__.py b/src/sdk/pynni/nni/nas/pytorch/spos/__init__.py index 16f83d4b14..dc43892384 100644 --- a/src/sdk/pynni/nni/nas/pytorch/spos/__init__.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + from .mutator import SPOSSupernetTrainingMutator +from .trainer import SPOSSupernetTrainer diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py b/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py index ceede542dc..f7d0e6d71f 100644 --- a/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py @@ -1,31 +1,33 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ import logging import numpy as np -import torch -import torch.nn.functional as F - from nni.nas.pytorch.random import RandomMutator _logger = logging.getLogger(__name__) class SPOSSupernetTrainingMutator(RandomMutator): - def __init__(self, model, flops_func, flops_lb, flops_ub, + def __init__(self, model, flops_func=None, flops_lb=None, flops_ub=None, flops_bin_num=7, flops_sample_timeout=500): super().__init__(model) self._flops_func = flops_func - self._flops_bin_num = flops_bin_num - self._flops_bins = [flops_lb + (flops_ub - flops_lb) / flops_bin_num * i for i in range(flops_bin_num)] - self._flops_sample_timeout = flops_sample_timeout + if self._flops_func is not None: + self._flops_bin_num = flops_bin_num + self._flops_bins = [flops_lb + (flops_ub - flops_lb) / flops_bin_num * i for i in range(flops_bin_num)] + self._flops_sample_timeout = flops_sample_timeout def sample_search(self): - for _ in range(self._flops_sample_timeout): - idx = np.random.randint(self._flops_bin_num) - cand = super().sample_search() - if self._flops_bins[idx] <= self._flops_func(cand) <= self._flops_bins[idx + 1]: - _logger.debug("Sampled candidate flops %f.", cand) - return cand - _logger.warning("Failed to sample a flops-valid candidate within %d tries.", self._flops_sample_timeout) + if self._flops_func is not None: + for times in range(self._flops_sample_timeout): + idx = np.random.randint(self._flops_bin_num) + cand = super().sample_search() + if self._flops_bins[idx] <= self._flops_func(cand) <= self._flops_bins[idx + 1]: + _logger.debug("Sampled candidate flops %f in %d times.", cand, times) + return cand + _logger.warning("Failed to sample a flops-valid candidate within %d tries.", self._flops_sample_timeout) return super().sample_search() def sample_final(self): diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py b/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py new file mode 100644 index 0000000000..e90726c4dd --- /dev/null +++ b/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py @@ -0,0 +1,61 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
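An illustrative sketch, not part of the patch: the sampling scheme of
SPOSSupernetTrainingMutator above with the NNI plumbing stripped away; random_arch() and
flops() are hypothetical stand-ins for RandomMutator.sample_search() and flops_func.
Note that the comprehension above creates only flops_bin_num bin edges, so
self._flops_bins[idx + 1] appears to index past the end when idx == flops_bin_num - 1;
the sketch below uses flops_bin_num + 1 edges.

import numpy as np

def sample_with_flops_bins(random_arch, flops, lb=290e6, ub=360e6, n_bins=7, timeout=500):
    edges = [lb + (ub - lb) / n_bins * i for i in range(n_bins + 1)]  # n_bins buckets need n_bins + 1 edges
    for _ in range(timeout):
        idx = np.random.randint(n_bins)           # pick a FLOPs bucket uniformly
        arch = random_arch()                      # one random single-path architecture
        if edges[idx] <= flops(arch) <= edges[idx + 1]:
            return arch
    return random_arch()                          # give up: unconstrained random fallback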
+ +import logging + +import torch +from nni.nas.pytorch.trainer import Trainer +from nni.nas.pytorch.utils import AverageMeterGroup + +from .mutator import SPOSSupernetTrainingMutator + +logger = logging.getLogger(__name__) + + +class SPOSSupernetTrainer(Trainer): + def __init__(self, model, loss, metrics, + optimizer, num_epochs, dataset_train, dataset_valid, + mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, + callbacks=None): + super().__init__(model, mutator if mutator is not None else SPOSSupernetTrainingMutator(model), + loss, metrics, optimizer, num_epochs, dataset_train, dataset_valid, + batch_size, workers, device, log_frequency, callbacks) + + self.train_loader = torch.utils.data.DataLoader(self.dataset_train, + batch_size=batch_size, + num_workers=workers) + self.valid_loader = torch.utils.data.DataLoader(self.dataset_valid, + batch_size=batch_size, + num_workers=workers) + + def train_one_epoch(self, epoch): + self.model.train() + meters = AverageMeterGroup() + for step, (x, y) in enumerate(self.train_loader): + x, y = x.to(self.device), y.to(self.device) + + self.optimizer.zero_grad() + logits = self.model(x) + loss = self.loss(x, y) + loss.backward() + self.optimizer.step() + + metrics = self.metrics(logits, y) + metrics["loss"] = loss.item() + meters.update(metrics) + if self.log_frequency is not None and step % self.log_frequency == 0: + logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch + 1, + self.num_epochs, step + 1, len(self.train_loader), meters) + + def validate_one_epoch(self, epoch): + self.model.validate() + meters = AverageMeterGroup() + with torch.no_grad(): + for step, (x, y) in enumerate(self.valid_loader): + x, y = x.to(self.device), y.to(self.device) + logits = self.model(x) + metrics = self.metrics(logits, y) + meters.update(metrics) + if self.log_frequency is not None and step % self.log_frequency == 0: + logger.info("Epoch [%s/%s] Validation Step [%s/%s] %s", epoch + 1, + self.num_epochs, step + 1, len(self.valid_loader), meters) From c720373242c13d994479492a83efddceee7790a7 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 2 Dec 2019 06:04:23 +0000 Subject: [PATCH 07/57] checkpoint --- examples/nas/spos/network.py | 17 ----------------- examples/nas/spos/train_supernet.py | 13 ++++++++----- src/sdk/pynni/nni/nas/pytorch/spos/mutator.py | 2 +- src/sdk/pynni/nni/nas/pytorch/spos/trainer.py | 9 ++++++--- 4 files changed, 15 insertions(+), 26 deletions(-) diff --git a/examples/nas/spos/network.py b/examples/nas/spos/network.py index 0d5dedee96..2e4f5081ce 100644 --- a/examples/nas/spos/network.py +++ b/examples/nas/spos/network.py @@ -7,7 +7,6 @@ from blocks import ShuffleNetBlock, ShuffleXceptionBlock from nni.nas.pytorch import mutables -from nni.nas.pytorch.spos import SPOSSupernetTrainingMutator class ShuffleNetV2OneShot(nn.Module): @@ -147,19 +146,3 @@ def load_and_parse_state_dict(): k = re.sub(r"^(features.\d+).(\d+)", "\\1.choices.\\2", k) result[k] = v return result - - -if __name__ == "__main__": - # architecture = [0, 0, 3, 1, 1, 1, 0, 0, 2, 0, 2, 1, 1, 0, 2, 0, 2, 1, 3, 2] - # scale_list = [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6] - # scale_ids = [6, 5, 3, 5, 2, 6, 3, 4, 2, 5, 7, 5, 4, 6, 7, 4, 4, 5, 4, 3] - model = ShuffleNetV2OneShot() - mutator = SPOSSupernetTrainingMutator(model, flops_func=model.get_candidate_flops, - flops_lb=290E6, flops_ub=360E6) - model_state_dict = load_and_parse_state_dict() - model.load_state_dict(model_state_dict) - - test_data = torch.rand(5, 3, 224, 224) - mutator.reset() - test_outputs = 
model(test_data) - print(test_outputs.size()) diff --git a/examples/nas/spos/train_supernet.py b/examples/nas/spos/train_supernet.py index e4f3757f2b..d8f09c811d 100644 --- a/examples/nas/spos/train_supernet.py +++ b/examples/nas/spos/train_supernet.py @@ -5,7 +5,7 @@ from nni.nas.pytorch.callbacks import Callback, LRSchedulerCallback from network import ShuffleNetV2OneShot -from src.sdk.pynni.nni.nas.pytorch.spos import SPOSSupernetTrainingMutator, SPOSSupernetTrainer +from nni.nas.pytorch.spos import SPOSSupernetTrainingMutator, SPOSSupernetTrainer from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy @@ -18,17 +18,19 @@ def on_epoch_begin(self, epoch): if __name__ == "__main__": parser = argparse.ArgumentParser("SPOS Supernet Training") - parser.add_argument("--imagenet-dir", type=int, default="./data/imagenet") + parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet") parser.add_argument("--batch-size", type=int, default=1024) parser.add_argument("--epochs", type=int, default=15) parser.add_argument("--learning-rate", type=float, default=0.5) parser.add_argument("--momentum", type=float, default=0.9) parser.add_argument("--weight-decay", type=float, default=4E-5) parser.add_argument("--label-smooth", type=float, default=0.1) + parser.add_argument("--log-frequency", type=int, default=10) args = parser.parse_args() dataset_train, dataset_valid = get_imagenet(args.imagenet_dir) model = ShuffleNetV2OneShot() + model.cuda() model = nn.DataParallel(model) mutator = SPOSSupernetTrainingMutator(model, flops_func=model.module.get_candidate_flops, flops_lb=290E6, flops_ub=360E6) @@ -37,9 +39,10 @@ def on_epoch_begin(self, epoch): momentum=args.momentum, weight_decay=args.weight_decay) scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: (1.0 - step / args.epochs) - if step <= args.total_iters else 0, + if step <= args.epochs else 0, last_epoch=-1) trainer = SPOSSupernetTrainer(model, criterion, accuracy, optimizer, - args.num_epochs, dataset_train, dataset_valid, - mutator=mutator, batch_size=args.batch_size, + args.epochs, dataset_train, dataset_valid, + mutator=mutator, batch_size=args.batch_size, log_frequency=args.log_frequency, callbacks=[LRSchedulerCallback(scheduler), AdjustBNMomentum()]) + trainer.train() diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py b/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py index f7d0e6d71f..7345cb7636 100644 --- a/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py @@ -16,7 +16,7 @@ def __init__(self, model, flops_func=None, flops_lb=None, flops_ub=None, self._flops_func = flops_func if self._flops_func is not None: self._flops_bin_num = flops_bin_num - self._flops_bins = [flops_lb + (flops_ub - flops_lb) / flops_bin_num * i for i in range(flops_bin_num)] + self._flops_bins = [flops_lb + (flops_ub - flops_lb) / flops_bin_num * i for i in range(flops_bin_num + 1)] self._flops_sample_timeout = flops_sample_timeout def sample_search(self): diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py b/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py index e90726c4dd..f18e3895dd 100644 --- a/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py @@ -23,7 +23,8 @@ def __init__(self, model, loss, metrics, self.train_loader = torch.utils.data.DataLoader(self.dataset_train, batch_size=batch_size, - num_workers=workers) + num_workers=workers, + shuffle=True) self.valid_loader = torch.utils.data.DataLoader(self.dataset_valid, 
batch_size=batch_size, num_workers=workers) @@ -35,8 +36,9 @@ def train_one_epoch(self, epoch): x, y = x.to(self.device), y.to(self.device) self.optimizer.zero_grad() + self.mutator.reset() logits = self.model(x) - loss = self.loss(x, y) + loss = self.loss(logits, y) loss.backward() self.optimizer.step() @@ -48,11 +50,12 @@ def train_one_epoch(self, epoch): self.num_epochs, step + 1, len(self.train_loader), meters) def validate_one_epoch(self, epoch): - self.model.validate() + self.model.eval() meters = AverageMeterGroup() with torch.no_grad(): for step, (x, y) in enumerate(self.valid_loader): x, y = x.to(self.device), y.to(self.device) + self.mutator.reset() logits = self.model(x) metrics = self.metrics(logits, y) meters.update(metrics) From 2edf63703d8d723178d6a49376164748ed798af1 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 2 Dec 2019 12:33:28 +0000 Subject: [PATCH 08/57] checkpoint --- examples/nas/spos/network.py | 4 +- examples/nas/spos/train_supernet.py | 19 +++++-- examples/nas/spos/utils.py | 53 +++++++++++++++++-- src/sdk/pynni/nni/nas/pytorch/spos/trainer.py | 5 +- 4 files changed, 69 insertions(+), 12 deletions(-) diff --git a/examples/nas/spos/network.py b/examples/nas/spos/network.py index 2e4f5081ce..42e3386347 100644 --- a/examples/nas/spos/network.py +++ b/examples/nas/spos/network.py @@ -137,8 +137,8 @@ def _initialize_weights(self): nn.init.constant_(m.bias, 0) -def load_and_parse_state_dict(): - checkpoint = torch.load("./data/checkpoint-150000.pth.tar") +def load_and_parse_state_dict(normalize_conv=False): + checkpoint = torch.load("./data/checkpoint-150000.pth.tar", map_location=torch.device("cpu")) result = dict() for k, v in checkpoint["state_dict"].items(): if k.startswith("module."): diff --git a/examples/nas/spos/train_supernet.py b/examples/nas/spos/train_supernet.py index d8f09c811d..35fc196745 100644 --- a/examples/nas/spos/train_supernet.py +++ b/examples/nas/spos/train_supernet.py @@ -4,7 +4,7 @@ import torch.nn as nn from nni.nas.pytorch.callbacks import Callback, LRSchedulerCallback -from network import ShuffleNetV2OneShot +from network import ShuffleNetV2OneShot, load_and_parse_state_dict from nni.nas.pytorch.spos import SPOSSupernetTrainingMutator, SPOSSupernetTrainer from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy @@ -19,6 +19,11 @@ def on_epoch_begin(self, epoch): if __name__ == "__main__": parser = argparse.ArgumentParser("SPOS Supernet Training") parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet") + parser.add_argument("--load-checkpoint", action="store_true", default=False) + parser.add_argument("--spos-preprocessing", action="store_true", default=False, + help="When true, image values will range from 0 to 255 and use BGR " + "(as in original repo).") + parser.add_argument("--workers", type=int, default=4) parser.add_argument("--batch-size", type=int, default=1024) parser.add_argument("--epochs", type=int, default=15) parser.add_argument("--learning-rate", type=float, default=0.5) @@ -28,8 +33,12 @@ def on_epoch_begin(self, epoch): parser.add_argument("--log-frequency", type=int, default=10) args = parser.parse_args() - dataset_train, dataset_valid = get_imagenet(args.imagenet_dir) + dataset_train, dataset_valid = get_imagenet(args.imagenet_dir, spos_pre=args.spos_preprocessing) model = ShuffleNetV2OneShot() + if args.load_checkpoint: + if not args.spos_preprocessing: + print("You might want to use SPOS preprocessing if you are loading their checkpoints.") + 
model.load_state_dict(load_and_parse_state_dict()) model.cuda() model = nn.DataParallel(model) mutator = SPOSSupernetTrainingMutator(model, flops_func=model.module.get_candidate_flops, @@ -43,6 +52,8 @@ def on_epoch_begin(self, epoch): last_epoch=-1) trainer = SPOSSupernetTrainer(model, criterion, accuracy, optimizer, args.epochs, dataset_train, dataset_valid, - mutator=mutator, batch_size=args.batch_size, log_frequency=args.log_frequency, + mutator=mutator, batch_size=args.batch_size, + log_frequency=args.log_frequency, workers=args.workers, callbacks=[LRSchedulerCallback(scheduler), AdjustBNMomentum()]) - trainer.train() + # trainer.train() + trainer.validate() diff --git a/examples/nas/spos/utils.py b/examples/nas/spos/utils.py index 8f42460f28..2d95c9e988 100644 --- a/examples/nas/spos/utils.py +++ b/examples/nas/spos/utils.py @@ -1,22 +1,65 @@ import torch import torch.nn as nn +from PIL import Image +import numpy as np + from torchvision import transforms from torchvision.datasets import ImageNet -def get_imagenet(imagenet_root): +IMAGENET_MEAN = [0.485, 0.456, 0.406] +IMAGENET_STD = [0.229, 0.224, 0.225] + + +def spos_to_bgr_tensor(pic): + """Modified from `to_tensor`""" + if not isinstance(pic, Image.Image): + raise TypeError('pic should be PIL Image. Got {}'.format(type(pic))) + + if pic.mode == 'I': + img = torch.from_numpy(np.array(pic, np.int32, copy=False)) + elif pic.mode == 'I;16': + img = torch.from_numpy(np.array(pic, np.int16, copy=False)) + elif pic.mode == 'F': + img = torch.from_numpy(np.array(pic, np.float32, copy=False)) + elif pic.mode == '1': + img = 255 * torch.from_numpy(np.array(pic, np.uint8, copy=False)) + else: + img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) + if pic.mode == 'YCbCr': + nchannel = 3 + elif pic.mode == 'I;16': + nchannel = 1 + else: + nchannel = len(pic.mode) + img = img.view(pic.size[1], pic.size[0], nchannel) + # put it from HWC to CHW format + # yikes, this transpose takes 80% of the loading time/CPU + img = img[:, :, [2, 1, 0]].transpose(0, 1).transpose(0, 2).contiguous() + return img.float() if isinstance(img, torch.ByteTensor) else img + + +def get_imagenet(imagenet_root, spos_pre): + if spos_pre: + postprocess = [ + transforms.Lambda(lambda img: spos_to_bgr_tensor(img)) + ] + else: + postprocess = [ + transforms.ToTensor(), + transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD) + ] + train_transform = transforms.Compose([ transforms.RandomResizedCrop(224), transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4), transforms.RandomHorizontalFlip(0.5), - transforms.ToTensor(), - ]) + ] + postprocess) valid_transform = transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), - transforms.ToTensor(), - ]) + ] + postprocess) train_dataset = ImageNet(imagenet_root, split="train", transform=train_transform) valid_dataset = ImageNet(imagenet_root, split="val", transform=valid_transform) return train_dataset, valid_dataset diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py b/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py index f18e3895dd..7f87b4d27f 100644 --- a/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py @@ -27,7 +27,8 @@ def __init__(self, model, loss, metrics, shuffle=True) self.valid_loader = torch.utils.data.DataLoader(self.dataset_valid, batch_size=batch_size, - num_workers=workers) + num_workers=workers, + shuffle=True) def train_one_epoch(self, epoch): self.model.train() @@ -57,7 +58,9 @@ def validate_one_epoch(self, 
epoch): x, y = x.to(self.device), y.to(self.device) self.mutator.reset() logits = self.model(x) + loss = self.loss(logits, y) metrics = self.metrics(logits, y) + metrics["loss"] = loss meters.update(metrics) if self.log_frequency is not None and step % self.log_frequency == 0: logger.info("Epoch [%s/%s] Validation Step [%s/%s] %s", epoch + 1, From 990932bc2ddb038e73c365aef11a37780c78983e Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 3 Dec 2019 02:29:46 +0000 Subject: [PATCH 09/57] checkpoint --- examples/nas/spos/network.py | 4 +- .../spos/{train_supernet.py => supernet.py} | 4 +- examples/nas/spos/tester.py | 53 +++++++++++++++++++ examples/nas/spos/utils.py | 5 ++ tools/nni_cmd/nnictl_utils.py | 7 ++- 5 files changed, 67 insertions(+), 6 deletions(-) rename examples/nas/spos/{train_supernet.py => supernet.py} (98%) create mode 100644 examples/nas/spos/tester.py diff --git a/examples/nas/spos/network.py b/examples/nas/spos/network.py index 42e3386347..63bfab4fb1 100644 --- a/examples/nas/spos/network.py +++ b/examples/nas/spos/network.py @@ -137,8 +137,8 @@ def _initialize_weights(self): nn.init.constant_(m.bias, 0) -def load_and_parse_state_dict(normalize_conv=False): - checkpoint = torch.load("./data/checkpoint-150000.pth.tar", map_location=torch.device("cpu")) +def load_and_parse_state_dict(filepath="./data/checkpoint-150000.pth.tar"): + checkpoint = torch.load(filepath, map_location=torch.device("cpu")) result = dict() for k, v in checkpoint["state_dict"].items(): if k.startswith("module."): diff --git a/examples/nas/spos/train_supernet.py b/examples/nas/spos/supernet.py similarity index 98% rename from examples/nas/spos/train_supernet.py rename to examples/nas/spos/supernet.py index 35fc196745..f0b4127686 100644 --- a/examples/nas/spos/train_supernet.py +++ b/examples/nas/spos/supernet.py @@ -55,5 +55,5 @@ def on_epoch_begin(self, epoch): mutator=mutator, batch_size=args.batch_size, log_frequency=args.log_frequency, workers=args.workers, callbacks=[LRSchedulerCallback(scheduler), AdjustBNMomentum()]) - # trainer.train() - trainer.validate() + trainer.train() + # trainer.validate() diff --git a/examples/nas/spos/tester.py b/examples/nas/spos/tester.py new file mode 100644 index 0000000000..f53a43f026 --- /dev/null +++ b/examples/nas/spos/tester.py @@ -0,0 +1,53 @@ +import argparse + +import torch +import torch.nn as nn +from nni.nas.pytorch.callbacks import Callback, LRSchedulerCallback + +from network import ShuffleNetV2OneShot, load_and_parse_state_dict +from nni.nas.pytorch.classic_nas import Class +from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("SPOS Candidate Tester") + parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet") + parser.add_argument("--checkpoint", type=str, default=) + parser.add_argument("--load-checkpoint", action="store_true", default=False) + parser.add_argument("--spos-preprocessing", action="store_true", default=False, + help="When true, image values will range from 0 to 255 and use BGR " + "(as in original repo).") + parser.add_argument("--workers", type=int, default=4) + parser.add_argument("--batch-size", type=int, default=1024) + parser.add_argument("--epochs", type=int, default=15) + parser.add_argument("--learning-rate", type=float, default=0.5) + parser.add_argument("--momentum", type=float, default=0.9) + parser.add_argument("--weight-decay", type=float, default=4E-5) + parser.add_argument("--label-smooth", type=float, default=0.1) + 
parser.add_argument("--log-frequency", type=int, default=10) + + args = parser.parse_args() + dataset_train, dataset_valid = get_imagenet(args.imagenet_dir, spos_pre=args.spos_preprocessing) + model = ShuffleNetV2OneShot() + if args.load_checkpoint: + if not args.spos_preprocessing: + print("You might want to use SPOS preprocessing if you are loading their checkpoints.") + model.load_state_dict(load_and_parse_state_dict()) + model.cuda() + model = nn.DataParallel(model) + mutator = SPOSSupernetTrainingMutator(model, flops_func=model.module.get_candidate_flops, + flops_lb=290E6, flops_ub=360E6) + criterion = CrossEntropyLabelSmooth(1000, 0.1) + optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, + momentum=args.momentum, weight_decay=args.weight_decay) + scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, + lambda step: (1.0 - step / args.epochs) + if step <= args.epochs else 0, + last_epoch=-1) + trainer = SPOSSupernetTrainer(model, criterion, accuracy, optimizer, + args.epochs, dataset_train, dataset_valid, + mutator=mutator, batch_size=args.batch_size, + log_frequency=args.log_frequency, workers=args.workers, + callbacks=[LRSchedulerCallback(scheduler), AdjustBNMomentum()]) + trainer.train() + # trainer.validate() diff --git a/examples/nas/spos/utils.py b/examples/nas/spos/utils.py index 2d95c9e988..5a7ab1941c 100644 --- a/examples/nas/spos/utils.py +++ b/examples/nas/spos/utils.py @@ -1,3 +1,5 @@ +import os + import torch import torch.nn as nn @@ -41,6 +43,9 @@ def spos_to_bgr_tensor(pic): def get_imagenet(imagenet_root, spos_pre): + if not os.path.exists(imagenet_root): + raise FileNotFoundError("Imagenet root {} not exists. Pointing to the right directory with " + "command-line arguments.".format(imagenet_root)) if spos_pre: postprocess = [ transforms.Lambda(lambda img: spos_to_bgr_tensor(img)) diff --git a/tools/nni_cmd/nnictl_utils.py b/tools/nni_cmd/nnictl_utils.py index c884d4562b..b9c44055a5 100644 --- a/tools/nni_cmd/nnictl_utils.py +++ b/tools/nni_cmd/nnictl_utils.py @@ -685,7 +685,10 @@ def search_space_auto_gen(args): abs_file_path = os.path.join(os.getcwd(), file_path) assert os.path.exists(trial_dir) if os.path.exists(abs_file_path): - print_warning('%s already exits, will be over written' % abs_file_path) + print_warning('%s already exists, will be overwritten.' 
% abs_file_path) print_normal('Dry run to generate search space...') Popen(args.trial_command, cwd=trial_dir, env=dict(os.environ, NNI_GEN_SEARCH_SPACE=abs_file_path), shell=True).wait() - print_normal('Dry run to generate search space, Done') \ No newline at end of file + if not os.path.exists(abs_file_path): + print_warning('Expected search space file \'{}\' generated, but not found.'.format(abs_file_path)) + else: + print_normal('Generate search space done: \'{}\'.'.format(abs_file_path)) From 117399cada28edf4107ab95dedb22a3d4e352138 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Tue, 3 Dec 2019 11:52:54 +0800 Subject: [PATCH 10/57] finish tester --- examples/nas/spos/tester.py | 101 +++++++--- .../nni/nas/pytorch/classic_nas/__init__.py | 3 + .../nni/nas/pytorch/classic_nas/mutator.py | 185 ++++++++---------- src/sdk/pynni/nni/nas/pytorch/utils.py | 6 + 4 files changed, 161 insertions(+), 134 deletions(-) diff --git a/examples/nas/spos/tester.py b/examples/nas/spos/tester.py index f53a43f026..b8295ba542 100644 --- a/examples/nas/spos/tester.py +++ b/examples/nas/spos/tester.py @@ -1,53 +1,90 @@ import argparse +import logging +from itertools import cycle +import nni import torch -import torch.nn as nn -from nni.nas.pytorch.callbacks import Callback, LRSchedulerCallback +from nni.nas.pytorch.classic_nas import get_and_apply_next_architecture +from nni.nas.pytorch.utils import AverageMeterGroup +from torch.utils.data import DataLoader from network import ShuffleNetV2OneShot, load_and_parse_state_dict -from nni.nas.pytorch.classic_nas import Class from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy +logger = logging.getLogger("nni") + + +def retrain_bn(model, criterion, max_iters, log_freq, loader_train, device): + logger.info("Clear BN statistics...") + for m in model.modules(): + if isinstance(m, torch.nn.BatchNorm2d): + m.running_mean = torch.zeros_like(m.running_mean) + m.running_var = torch.ones_like(m.running_var) + logger.info("Train BN with training set (BN sanitize)...") + model.train() + + meters = AverageMeterGroup() + for step in range(max_iters): + inputs, targets = next(loader_train) + inputs, targets = inputs.to(device), targets.to(device) + logits = model(inputs) + loss = criterion(logits, targets) + metrics = accuracy(logits, targets) + metrics["loss"] = loss + meters.update(metrics) + if step % log_freq == 0 or step + 1 == max_iters: + logger.info("Train Step [%d/%d] %s", step + 1, max_iters, meters) + + +def test_acc(model, criterion, max_iters, log_freq, loader_test, device): + logger.info("Start testing...") + model.eval() + meters = AverageMeterGroup() + with torch.no_grad(): + for step in range(max_iters): + inputs, targets = next(loader_test) + inputs, targets = inputs.to(device), targets.to(device) + logits = model(inputs) + loss = criterion(logits, targets) + metrics = accuracy(logits, targets) + metrics["loss"] = loss + meters.update(metrics) + if step % log_freq == 0 or step + 1 == max_iters: + logger.info("Valid Step [%d/%d] %s", step + 1, max_iters, meters) + return meters.acc1.avg + + +def evaluate_acc(model, criterion, args, loader_train, loader_test, device): + retrain_bn(model, criterion, args.train_iters, args.log_frequency, loader_train, device) + acc = test_acc(model, criterion, args.max_iters, args.log_frequency, loader_test, device) + assert isinstance(acc, float) + nni.report_final_result(acc) + if __name__ == "__main__": parser = argparse.ArgumentParser("SPOS Candidate Tester") parser.add_argument("--imagenet-dir", type=str, 
default="./data/imagenet") - parser.add_argument("--checkpoint", type=str, default=) - parser.add_argument("--load-checkpoint", action="store_true", default=False) + parser.add_argument("--checkpoint", type=str, default="./data/checkpoint-150000.pth.tar") parser.add_argument("--spos-preprocessing", action="store_true", default=False, help="When true, image values will range from 0 to 255 and use BGR " "(as in original repo).") parser.add_argument("--workers", type=int, default=4) - parser.add_argument("--batch-size", type=int, default=1024) - parser.add_argument("--epochs", type=int, default=15) - parser.add_argument("--learning-rate", type=float, default=0.5) - parser.add_argument("--momentum", type=float, default=0.9) - parser.add_argument("--weight-decay", type=float, default=4E-5) - parser.add_argument("--label-smooth", type=float, default=0.1) + parser.add_argument("--batch-size", type=int, default=200) + parser.add_argument("--train-iters", type=int, default=128) + parser.add_argument("--test-iters", type=int, default=40) parser.add_argument("--log-frequency", type=int, default=10) args = parser.parse_args() + use_gpu = torch.cuda.is_available() + device = torch.device("cuda") if use_gpu else torch.device("cpu") dataset_train, dataset_valid = get_imagenet(args.imagenet_dir, spos_pre=args.spos_preprocessing) + loader_train = DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True, + num_workers=args.workers, pin_memory=use_gpu) + loader_valid = DataLoader(dataset_valid, batch_size=args.batch_size, shuffle=True, + num_workers=args.workers, pin_memory=use_gpu) + loader_train, loader_valid = cycle(loader_train), cycle(loader_valid) model = ShuffleNetV2OneShot() - if args.load_checkpoint: - if not args.spos_preprocessing: - print("You might want to use SPOS preprocessing if you are loading their checkpoints.") - model.load_state_dict(load_and_parse_state_dict()) - model.cuda() - model = nn.DataParallel(model) - mutator = SPOSSupernetTrainingMutator(model, flops_func=model.module.get_candidate_flops, - flops_lb=290E6, flops_ub=360E6) criterion = CrossEntropyLabelSmooth(1000, 0.1) - optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, - momentum=args.momentum, weight_decay=args.weight_decay) - scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, - lambda step: (1.0 - step / args.epochs) - if step <= args.epochs else 0, - last_epoch=-1) - trainer = SPOSSupernetTrainer(model, criterion, accuracy, optimizer, - args.epochs, dataset_train, dataset_valid, - mutator=mutator, batch_size=args.batch_size, - log_frequency=args.log_frequency, workers=args.workers, - callbacks=[LRSchedulerCallback(scheduler), AdjustBNMomentum()]) - trainer.train() - # trainer.validate() + model.load_state_dict(load_and_parse_state_dict(filepath=args.checkpoint)) + model.to(device) + get_and_apply_next_architecture(model) diff --git a/src/sdk/pynni/nni/nas/pytorch/classic_nas/__init__.py b/src/sdk/pynni/nni/nas/pytorch/classic_nas/__init__.py index 0b31948bf9..ec3f5a4894 100644 --- a/src/sdk/pynni/nni/nas/pytorch/classic_nas/__init__.py +++ b/src/sdk/pynni/nni/nas/pytorch/classic_nas/__init__.py @@ -1 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ from .mutator import get_and_apply_next_architecture diff --git a/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py b/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py index cc56637bb2..4693d6cb40 100644 --- a/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py @@ -1,15 +1,21 @@ -import os -import sys +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + import json import logging +import os +import sys + import torch + import nni from nni.env_vars import trial_env_vars -from nni.nas.pytorch.base_mutator import BaseMutator from nni.nas.pytorch.mutables import LayerChoice, InputChoice +from nni.nas.pytorch.mutator import Mutator logger = logging.getLogger(__name__) + def get_and_apply_next_architecture(model): """ Wrapper of ClassicMutator to make it more meaningful, @@ -22,12 +28,14 @@ def get_and_apply_next_architecture(model): """ ClassicMutator(model) -class ClassicMutator(BaseMutator): + +class ClassicMutator(Mutator): """ This mutator is to apply the architecture chosen from tuner. It implements the forward function of LayerChoice and InputChoice, to only activate the chosen ones """ + def __init__(self, model): """ Generate search space based on ```model```. @@ -40,36 +48,73 @@ def __init__(self, model): Parameters ---------- - model : pytorch model + model : PyTorch model user's model with search space (e.g., LayerChoice, InputChoice) embedded in it """ super(ClassicMutator, self).__init__(model) - self.chosen_arch = {} - self.search_space = self._generate_search_space() - if 'NNI_GEN_SEARCH_SPACE' in os.environ: + self._chosen_arch = {} + self._search_space = self._generate_search_space() + if "NNI_GEN_SEARCH_SPACE" in os.environ: # dry run for only generating search space - self._dump_search_space(self.search_space, os.environ.get('NNI_GEN_SEARCH_SPACE')) + self._dump_search_space(self.search_space, os.environ["NNI_GEN_SEARCH_SPACE"]) sys.exit(0) - # get chosen arch from tuner - self.chosen_arch = nni.get_next_parameter() - if not self.chosen_arch and trial_env_vars.NNI_PLATFORM is None: - logger.warning('This is in standalone mode, the chosen are the first one(s)') - self.chosen_arch = self._standalone_generate_chosen() - self._validate_chosen_arch() - def _validate_chosen_arch(self): - pass + if trial_env_vars.NNI_PLATFORM is None: + logger.warning("This is in standalone mode, the chosen are the first one(s)") + self._chosen_arch = self._standalone_generate_chosen() + else: + # get chosen arch from tuner + self._chosen_arch = nni.get_next_parameter() + self.sample_final() + + def sample_search(self): + return self.sample_final() + + def sample_final(self): + assert set(self._chosen_arch.keys()) == set(self._search_space.keys()), \ + "Unmatched keys, expected keys '{}' from search space, found '{}'.".format(self._search_space.keys(), + self._chosen_arch.keys()) + result = dict() + for mutable in self.mutables: + assert mutable.key in self._chosen_arch, "Expected '{}' in chosen arch, but not found.".format(mutable.key) + data = self._chosen_arch[mutable.key] + assert isinstance(data, dict) and "_value" in data and "_idx" in data, \ + "'{}' is not a valid choice.".format(data) + value = data["_value"] + idx = data["_idx"] + search_space_ref = self.search_space[mutable.key]["_value"] + if isinstance(mutable, LayerChoice): + # doesn't support multihot for layer choice yet + onehot_list = [False] * mutable.length + assert 0 <= idx < mutable.length and search_space_ref[idx] == value, \ + "Index 
'{}' in search space '{}' is not '{}'".format(idx, search_space_ref, value) + onehot_list[idx] = True + result[mutable.key] = torch.tensor(onehot_list, dtype=torch.bool) # pylint: disable=not-callable + elif isinstance(mutable, InputChoice): + multihot_list = [False] * mutable.n_candidates + for i in idx: + assert 0 <= i < mutable.n_candidates and search_space_ref[i] == value, \ + "Index '{}' in search space '{}' is not '{}'".format(i, search_space_ref, value) + assert not multihot_list[i], "'{}' is selected twice in '{}', this is not allowed.".format(i, idx) + multihot_list[i] = True + result[mutable.key] = torch.tensor(multihot_list, dtype=torch.bool) # pylint: disable=not-callable + else: + raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) + + def reset(self): + pass # do nothing, only sample once at initialization def _standalone_generate_chosen(self): """ Generate the chosen architecture for standalone mode, - i.e., choose the first one(s) for LayerChoice and InputChoice + i.e., choose the first one(s) for LayerChoice and InputChoice. - { key_name: {'_value': "conv1", - '_idx': 0} } + :: + { key_name: {"_value": "conv1", + "_idx": 0} } - { key_name: {'_value': ["in1"], - '_idx': [0]} } + { key_name: {"_value": ["in1"], + "_idx": [0]} } Returns ------- @@ -78,15 +123,15 @@ def _standalone_generate_chosen(self): """ chosen_arch = {} for key, val in self.search_space.items(): - if val['_type'] == 'layer_choice': - choices = val['_value'] - chosen_arch[key] = {'_value': choices[0], '_idx': 0} - elif val['_type'] == 'input_choice': - choices = val['_value']['candidates'] - n_chosen = val['_value']['n_chosen'] - chosen_arch[key] = {'_value': choices[:n_chosen], '_idx': list(range(n_chosen))} + if val["_type"] == "layer_choice": + choices = val["_value"] + chosen_arch[key] = {"_value": choices[0], "_idx": 0} + elif val["_type"] == "input_choice": + choices = val["_value"]["candidates"] + n_chosen = val["_value"]["n_chosen"] + chosen_arch[key] = {"_value": choices[:n_chosen], "_idx": list(range(n_chosen))} else: - raise ValueError('Unknown key %s and value %s' % (key, val)) + raise ValueError("Unknown key '%s' and value '%s'." % (key, val)) return chosen_arch def _generate_search_space(self): @@ -94,12 +139,13 @@ def _generate_search_space(self): Generate search space from mutables. Here is the search space format: - { key_name: {'_type': 'layer_choice', - '_value': ["conv1", "conv2"]} } + :: + { key_name: {"_type": "layer_choice", + "_value": ["conv1", "conv2"]} } - { key_name: {'_type': 'input_choice', - '_value': {'candidates': ["in1", "in2"], - 'n_chosen': 1}} } + { key_name: {"_type": "input_choice", + "_value": {"candidates": ["in1", "in2"], + "n_chosen": 1}} } Returns ------- @@ -119,74 +165,9 @@ def _generate_search_space(self): "_value": {"candidates": mutable.choose_from, "n_chosen": mutable.n_chosen}} else: - raise TypeError('Unsupported mutable type: %s.' % type(mutable)) + raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) return search_space def _dump_search_space(self, search_space, file_path): - with open(file_path, 'w') as ss_file: + with open(file_path, "w") as ss_file: json.dump(search_space, ss_file) - - def _tensor_reduction(self, reduction_type, tensor_list): - if tensor_list == "none": - return tensor_list - if not tensor_list: - return None # empty. 
return None for now - if len(tensor_list) == 1: - return tensor_list[0] - if reduction_type == "sum": - return sum(tensor_list) - if reduction_type == "mean": - return sum(tensor_list) / len(tensor_list) - if reduction_type == "concat": - return torch.cat(tensor_list, dim=1) - raise ValueError("Unrecognized reduction policy: \"{}\"".format(reduction_type)) - - def on_forward_layer_choice(self, mutable, *inputs): - """ - Implement the forward of LayerChoice - - Parameters - ---------- - mutable: LayerChoice - inputs: list of torch.Tensor - - Returns - ------- - tuple - return of the chosen op, the index of the chosen op - - """ - assert mutable.key in self.chosen_arch - val = self.chosen_arch[mutable.key] - assert isinstance(val, dict) - idx = val['_idx'] - assert self.search_space[mutable.key]['_value'][idx] == val['_value'] - return mutable.choices[idx](*inputs), idx - - def on_forward_input_choice(self, mutable, tensor_list): - """ - Implement the forward of InputChoice - - Parameters - ---------- - mutable: InputChoice - tensor_list: list of torch.Tensor - tags: list of string - - Returns - ------- - tuple of torch.Tensor and list - reduced tensor, mask list - - """ - assert mutable.key in self.chosen_arch - val = self.chosen_arch[mutable.key] - assert isinstance(val, dict) - mask = [0 for _ in range(mutable.n_candidates)] - out = [] - for i, idx in enumerate(val['_idx']): - # check whether idx matches the chosen candidate name - assert self.search_space[mutable.key]['_value']['candidates'][idx] == val['_value'][i] - out.append(tensor_list[idx]) - mask[idx] = 1 - return self._tensor_reduction(mutable.reduction, out), mask diff --git a/src/sdk/pynni/nni/nas/pytorch/utils.py b/src/sdk/pynni/nni/nas/pytorch/utils.py index 19244d015d..5666428213 100644 --- a/src/sdk/pynni/nni/nas/pytorch/utils.py +++ b/src/sdk/pynni/nni/nas/pytorch/utils.py @@ -23,6 +23,12 @@ def update(self, data): self.meters[k] = AverageMeter(k, ":4f") self.meters[k].update(v) + def __getattr__(self, item): + return self.meters[item] + + def __getitem__(self, item): + return self.meters[item] + def __str__(self): return " ".join(str(v) for _, v in self.meters.items()) From b4763d90077e4e3c387e18933ea1b13aa8bc927f Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 3 Dec 2019 04:28:48 +0000 Subject: [PATCH 11/57] fix bugs --- examples/nas/spos/tester.py | 13 ++++++++----- .../pynni/nni/nas/pytorch/classic_nas/mutator.py | 15 ++++++++------- src/sdk/pynni/nni/nas/pytorch/spos/trainer.py | 2 +- src/sdk/pynni/nni/nas/pytorch/utils.py | 5 +++++ 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/examples/nas/spos/tester.py b/examples/nas/spos/tester.py index b8295ba542..96f408fa01 100644 --- a/examples/nas/spos/tester.py +++ b/examples/nas/spos/tester.py @@ -4,6 +4,7 @@ import nni import torch +import torch.nn as nn from nni.nas.pytorch.classic_nas import get_and_apply_next_architecture from nni.nas.pytorch.utils import AverageMeterGroup from torch.utils.data import DataLoader @@ -17,20 +18,21 @@ def retrain_bn(model, criterion, max_iters, log_freq, loader_train, device): logger.info("Clear BN statistics...") for m in model.modules(): - if isinstance(m, torch.nn.BatchNorm2d): + if isinstance(m, nn.BatchNorm2d): m.running_mean = torch.zeros_like(m.running_mean) m.running_var = torch.ones_like(m.running_var) + logger.info("Train BN with training set (BN sanitize)...") model.train() - meters = AverageMeterGroup() for step in range(max_iters): inputs, targets = next(loader_train) inputs, targets = inputs.to(device), 
targets.to(device) + model.zero_grad() logits = model(inputs) loss = criterion(logits, targets) metrics = accuracy(logits, targets) - metrics["loss"] = loss + metrics["loss"] = loss.item() meters.update(metrics) if step % log_freq == 0 or step + 1 == max_iters: logger.info("Train Step [%d/%d] %s", step + 1, max_iters, meters) @@ -47,7 +49,7 @@ def test_acc(model, criterion, max_iters, log_freq, loader_test, device): logits = model(inputs) loss = criterion(logits, targets) metrics = accuracy(logits, targets) - metrics["loss"] = loss + metrics["loss"] = loss.item() meters.update(metrics) if step % log_freq == 0 or step + 1 == max_iters: logger.info("Valid Step [%d/%d] %s", step + 1, max_iters, meters) @@ -56,7 +58,7 @@ def test_acc(model, criterion, max_iters, log_freq, loader_test, device): def evaluate_acc(model, criterion, args, loader_train, loader_test, device): retrain_bn(model, criterion, args.train_iters, args.log_frequency, loader_train, device) - acc = test_acc(model, criterion, args.max_iters, args.log_frequency, loader_test, device) + acc = test_acc(model, criterion, args.test_iters, args.log_frequency, loader_test, device) assert isinstance(acc, float) nni.report_final_result(acc) @@ -88,3 +90,4 @@ def evaluate_acc(model, criterion, args, loader_train, loader_test, device): model.load_state_dict(load_and_parse_state_dict(filepath=args.checkpoint)) model.to(device) get_and_apply_next_architecture(model) + evaluate_acc(model, criterion, args, loader_train, loader_valid, device) diff --git a/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py b/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py index 4693d6cb40..5af692cdad 100644 --- a/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py @@ -56,7 +56,7 @@ def __init__(self, model): self._search_space = self._generate_search_space() if "NNI_GEN_SEARCH_SPACE" in os.environ: # dry run for only generating search space - self._dump_search_space(self.search_space, os.environ["NNI_GEN_SEARCH_SPACE"]) + self._dump_search_space(os.environ["NNI_GEN_SEARCH_SPACE"]) sys.exit(0) if trial_env_vars.NNI_PLATFORM is None: @@ -65,7 +65,7 @@ def __init__(self, model): else: # get chosen arch from tuner self._chosen_arch = nni.get_next_parameter() - self.sample_final() + self._cache = self.sample_final() def sample_search(self): return self.sample_final() @@ -82,7 +82,7 @@ def sample_final(self): "'{}' is not a valid choice.".format(data) value = data["_value"] idx = data["_idx"] - search_space_ref = self.search_space[mutable.key]["_value"] + search_space_ref = self._search_space[mutable.key]["_value"] if isinstance(mutable, LayerChoice): # doesn't support multihot for layer choice yet onehot_list = [False] * mutable.length @@ -95,11 +95,12 @@ def sample_final(self): for i in idx: assert 0 <= i < mutable.n_candidates and search_space_ref[i] == value, \ "Index '{}' in search space '{}' is not '{}'".format(i, search_space_ref, value) - assert not multihot_list[i], "'{}' is selected twice in '{}', this is not allowed.".format(i, idx) + assert not multihot_list[i], "'{}' is selected twice in '{}', which is not allowed.".format(i, idx) multihot_list[i] = True result[mutable.key] = torch.tensor(multihot_list, dtype=torch.bool) # pylint: disable=not-callable else: raise TypeError("Unsupported mutable type: '%s'." 
% type(mutable)) + return result def reset(self): pass # do nothing, only sample once at initialization @@ -122,7 +123,7 @@ def _standalone_generate_chosen(self): the chosen architecture """ chosen_arch = {} - for key, val in self.search_space.items(): + for key, val in self._search_space.items(): if val["_type"] == "layer_choice": choices = val["_value"] chosen_arch[key] = {"_value": choices[0], "_idx": 0} @@ -168,6 +169,6 @@ def _generate_search_space(self): raise TypeError("Unsupported mutable type: '%s'." % type(mutable)) return search_space - def _dump_search_space(self, search_space, file_path): + def _dump_search_space(self, file_path): with open(file_path, "w") as ss_file: - json.dump(search_space, ss_file) + json.dump(self._search_space, ss_file) diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py b/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py index 7f87b4d27f..04086ec795 100644 --- a/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py @@ -60,7 +60,7 @@ def validate_one_epoch(self, epoch): logits = self.model(x) loss = self.loss(logits, y) metrics = self.metrics(logits, y) - metrics["loss"] = loss + metrics["loss"] = loss.item() meters.update(metrics) if self.log_frequency is not None and step % self.log_frequency == 0: logger.info("Epoch [%s/%s] Validation Step [%s/%s] %s", epoch + 1, diff --git a/src/sdk/pynni/nni/nas/pytorch/utils.py b/src/sdk/pynni/nni/nas/pytorch/utils.py index 5666428213..71cd2f8a0a 100644 --- a/src/sdk/pynni/nni/nas/pytorch/utils.py +++ b/src/sdk/pynni/nni/nas/pytorch/utils.py @@ -1,10 +1,13 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +import logging from collections import OrderedDict _counter = 0 +_logger = logging.getLogger(__name__) + def global_mutable_counting(): global _counter @@ -58,6 +61,8 @@ def reset(self): self.count = 0 def update(self, val, n=1): + if not isinstance(val, float) and not isinstance(val, int): + _logger.warning("Values passed to AverageMeter must be number, not %s.", type(val)) self.val = val self.sum += val * n self.count += n From dfdb9fb7833228fdd0463735bae67a2fc7a2bcd1 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Tue, 3 Dec 2019 17:25:06 +0800 Subject: [PATCH 12/57] checkpoint --- examples/nas/.gitignore | 1 + examples/nas/spos/evolution.py | 145 ++++++++++++++++++ examples/nas/spos/network.py | 6 +- examples/nas/spos/tester.py | 13 +- .../nni/nas/pytorch/classic_nas/mutator.py | 8 +- 5 files changed, 163 insertions(+), 10 deletions(-) create mode 100644 examples/nas/spos/evolution.py diff --git a/examples/nas/.gitignore b/examples/nas/.gitignore index 8eeb0c2a3f..e26f9a17a1 100644 --- a/examples/nas/.gitignore +++ b/examples/nas/.gitignore @@ -1,3 +1,4 @@ data checkpoints runs +nni_auto_gen_search_space.json diff --git a/examples/nas/spos/evolution.py b/examples/nas/spos/evolution.py new file mode 100644 index 0000000000..2b9ab8907f --- /dev/null +++ b/examples/nas/spos/evolution.py @@ -0,0 +1,145 @@ +import json +import logging +from collections import deque + +import numpy as np +from nni.tuner import Tuner + +from network import ShuffleNetV2OneShot + +_logger = logging.getLogger("nni") + + +class hashabledict(dict): + def __hash__(self): + return json.dumps(self, sort_keys=True) + + +class Evolution(Tuner): + + def __init__(self, max_epochs=20, num_select=10, num_population=50, m_prob=0.1, + num_crossover=25, num_mutation=25, flops_limit=330E6): + assert num_population >= num_select + self.max_epochs = max_epochs + self.num_select = 
num_select + self.num_population = num_population + self.m_prob = m_prob + self.num_crossover = num_crossover + self.num_mutation = num_mutation + self.flops_limit = flops_limit + self.model = ShuffleNetV2OneShot() + self.epoch = 0 + self.candidates = [] + self.search_space = None + self.random_state = np.random.RandomState() + + # async status + self._to_evaluate_queue = deque() + self._sending_parameter_queue = deque() + self._pending_result_ids = set() + self._reward_dict = dict() + self._id2candidate = dict() + + def update_search_space(self, search_space): + self._search_space = search_space + self._next_round() + + def _next_round(self): + if self.epoch >= self.max_epochs: + return + _logger.info("Epoch %d, generating...", self.epoch) + if self.epoch == 0: + self.candidates = self._get_random_population() + else: + best_candidates = self._select_top_candidates() + self.candidates = self._get_mutation(best_candidates) + self._get_crossover(best_candidates) + \ + self._get_random_population() + self.epoch += 1 + + def _random_candidate(self): + chosen_arch = hashabledict() + for key, val in self._search_space.items(): + if val["_type"] == "layer_choice": + choices = val["_value"] + index = self.random_state.randint(len(choices)) + chosen_arch[key] = {"_value": choices[index], "_idx": index} + elif val["_type"] == "input_choice": + raise NotImplementedError("Input choice is not implemented yet.") + return chosen_arch + + def _add_to_evaluate_queue(self, cand): + self._reward_dict[cand] = 0. + self._to_evaluate_queue.append(cand) + + def _get_random_population(self): + result = [] + for _ in range(self.num_population): + while True: + cand = self._random_candidate() + if self._is_legal(cand): + result.append(cand) + self._add_to_evaluate_queue(cand) + break + return result + + def _get_crossover(self, best): + result = [] + for _ in range(10 * self.num_crossover): + cand_p1 = best[self.random_state.randint(len(best))] + cand_p2 = best[self.random_state.randint(len(best))] + assert cand_p1.keys() == cand_p2.keys() + cand = {k: cand_p1[k] if self.random_state.randint(2) == 0 else cand_p2[k] + for k in cand_p1.keys()} + if self._is_legal(cand): + result.append(cand) + self._add_to_evaluate_queue(cand) + if len(result) >= self.num_crossover: + break + return result + + def _get_mutation(self, best): + cand = best[self.random_state.randint(len(best))] + mutation_sample = np.random.random_sample(len(cand)) + for s, k in zip(mutation_sample, cand): + if s < self.m_prob: + choices = self._search_space[k]["_value"] + index = self.random_state.randint(len(choices)) + cand[k] = {"_value": choices[index], "_idx": index} + return cand + + def _is_legal(self, cand): + if cand in self._reward_dict: + return False + if self.model.get_candidate_flops(cand) > self.flops_limit: + return False + return True + + def _select_top_candidates(self): + return sorted(self.candidates, key=lambda cand: self._reward_dict[cand], reverse=True)[:self.num_select] + + def generate_multiple_parameters(self, parameter_id_list, **kwargs): + result = [] + for parameter_id in parameter_id_list: + self._sending_parameter_queue.append(parameter_id) + while self._sending_parameter_queue and self._to_evaluate_queue: + parameter_id = self._sending_parameter_queue.popleft() + parameters = self._to_evaluate_queue.popleft() + self._id2candidate[parameter_id] = parameters + result.append(parameters) + self._pending_result_ids.add(parameter_id) + return result + + def receive_trial_result(self, parameter_id, parameters, value, 
**kwargs): + self._reward_dict[self._id2candidate[parameter_id]] = value + + def trial_end(self, parameter_id, success, **kwargs): + self._pending_result_ids.remove(parameter_id) + if not self._pending_result_ids: + # a new epoch now + self._next_round() + + +if __name__ == "__main__": + tuner = Evolution() + tuner.update_search_space(json.load(open("nni_auto_gen_search_space.json", "r"))) + print(tuner.generate_multiple_parameters([_ for _ in range(20)])) diff --git a/examples/nas/spos/network.py b/examples/nas/spos/network.py index 63bfab4fb1..eefbe5c99e 100644 --- a/examples/nas/spos/network.py +++ b/examples/nas/spos/network.py @@ -108,7 +108,11 @@ def get_candidate_flops(self, candidate): self._feature_map_size, self._feature_map_size, 1)] total_flops = conv1_flops + rest_flops for k, m in candidate.items(): - total_flops += self._parsed_flops[k][torch.max(m, 0)[1]] + parsed_flops_dict = self._parsed_flops[k] + if isinstance(m, dict): # to be compatible with classical nas format + total_flops += parsed_flops_dict[m["_idx"]] + else: + total_flops += [torch.max(m, 0)[1]] return total_flops def _initialize_weights(self): diff --git a/examples/nas/spos/tester.py b/examples/nas/spos/tester.py index 96f408fa01..17221d101c 100644 --- a/examples/nas/spos/tester.py +++ b/examples/nas/spos/tester.py @@ -79,15 +79,18 @@ def evaluate_acc(model, criterion, args, loader_train, loader_test, device): args = parser.parse_args() use_gpu = torch.cuda.is_available() device = torch.device("cuda") if use_gpu else torch.device("cpu") + + model = ShuffleNetV2OneShot() + criterion = CrossEntropyLabelSmooth(1000, 0.1) + get_and_apply_next_architecture(model) + model.load_state_dict(load_and_parse_state_dict(filepath=args.checkpoint)) + model.to(device) + dataset_train, dataset_valid = get_imagenet(args.imagenet_dir, spos_pre=args.spos_preprocessing) loader_train = DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=use_gpu) loader_valid = DataLoader(dataset_valid, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=use_gpu) loader_train, loader_valid = cycle(loader_train), cycle(loader_valid) - model = ShuffleNetV2OneShot() - criterion = CrossEntropyLabelSmooth(1000, 0.1) - model.load_state_dict(load_and_parse_state_dict(filepath=args.checkpoint)) - model.to(device) - get_and_apply_next_architecture(model) + evaluate_acc(model, criterion, args, loader_train, loader_valid, device) diff --git a/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py b/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py index 5af692cdad..0fad38a1b7 100644 --- a/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py @@ -92,9 +92,9 @@ def sample_final(self): result[mutable.key] = torch.tensor(onehot_list, dtype=torch.bool) # pylint: disable=not-callable elif isinstance(mutable, InputChoice): multihot_list = [False] * mutable.n_candidates - for i in idx: - assert 0 <= i < mutable.n_candidates and search_space_ref[i] == value, \ - "Index '{}' in search space '{}' is not '{}'".format(i, search_space_ref, value) + for i, v in zip(idx, value): + assert 0 <= i < mutable.n_candidates and search_space_ref[i] == v, \ + "Index '{}' in search space '{}' is not '{}'".format(i, search_space_ref, v) assert not multihot_list[i], "'{}' is selected twice in '{}', which is not allowed.".format(i, idx) multihot_list[i] = True result[mutable.key] = torch.tensor(multihot_list, dtype=torch.bool) # pylint: 
disable=not-callable @@ -171,4 +171,4 @@ def _generate_search_space(self): def _dump_search_space(self, file_path): with open(file_path, "w") as ss_file: - json.dump(self._search_space, ss_file) + json.dump(self._search_space, ss_file, sort_keys=True, indent=2) From f16256df7478830ccbe224c45e635f8b02d3dc72 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Tue, 3 Dec 2019 17:38:01 +0800 Subject: [PATCH 13/57] checkpoint --- examples/nas/spos/evolution.py | 52 +++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/examples/nas/spos/evolution.py b/examples/nas/spos/evolution.py index 2b9ab8907f..aea670e996 100644 --- a/examples/nas/spos/evolution.py +++ b/examples/nas/spos/evolution.py @@ -10,11 +10,6 @@ _logger = logging.getLogger("nni") -class hashabledict(dict): - def __hash__(self): - return json.dumps(self, sort_keys=True) - - class Evolution(Tuner): def __init__(self, max_epochs=20, num_select=10, num_population=50, m_prob=0.1, @@ -57,7 +52,7 @@ def _next_round(self): self.epoch += 1 def _random_candidate(self): - chosen_arch = hashabledict() + chosen_arch = dict() for key, val in self._search_space.items(): if val["_type"] == "layer_choice": choices = val["_value"] @@ -68,7 +63,7 @@ def _random_candidate(self): return chosen_arch def _add_to_evaluate_queue(self, cand): - self._reward_dict[cand] = 0. + self._reward_dict[self._hashcode(cand)] = 0. self._to_evaluate_queue.append(cand) def _get_random_population(self): @@ -98,24 +93,36 @@ def _get_crossover(self, best): return result def _get_mutation(self, best): - cand = best[self.random_state.randint(len(best))] - mutation_sample = np.random.random_sample(len(cand)) - for s, k in zip(mutation_sample, cand): - if s < self.m_prob: - choices = self._search_space[k]["_value"] - index = self.random_state.randint(len(choices)) - cand[k] = {"_value": choices[index], "_idx": index} - return cand + result = [] + for _ in range(10 * self.num_mutation): + cand = best[self.random_state.randint(len(best))] + mutation_sample = np.random.random_sample(len(cand)) + for s, k in zip(mutation_sample, cand): + if s < self.m_prob: + choices = self._search_space[k]["_value"] + index = self.random_state.randint(len(choices)) + cand[k] = {"_value": choices[index], "_idx": index} + if self._is_legal(cand): + result.append(cand) + self._add_to_evaluate_queue(cand) + if len(result) >= self.num_mutation: + break + return result def _is_legal(self, cand): - if cand in self._reward_dict: + if self._hashcode(cand) in self._reward_dict: return False if self.model.get_candidate_flops(cand) > self.flops_limit: return False return True def _select_top_candidates(self): - return sorted(self.candidates, key=lambda cand: self._reward_dict[cand], reverse=True)[:self.num_select] + return sorted(self.candidates, key=lambda cand: self._reward_dict[self._hashcode(cand)], + reverse=True)[:self.num_select] + + @staticmethod + def _hashcode(d): + return json.dumps(d, sort_keys=True) def generate_multiple_parameters(self, parameter_id_list, **kwargs): result = [] @@ -130,7 +137,7 @@ def generate_multiple_parameters(self, parameter_id_list, **kwargs): return result def receive_trial_result(self, parameter_id, parameters, value, **kwargs): - self._reward_dict[self._id2candidate[parameter_id]] = value + self._reward_dict[self._hashcode(self._id2candidate[parameter_id])] = value def trial_end(self, parameter_id, success, **kwargs): self._pending_result_ids.remove(parameter_id) @@ -142,4 +149,11 @@ def trial_end(self, parameter_id, success, 
**kwargs): if __name__ == "__main__": tuner = Evolution() tuner.update_search_space(json.load(open("nni_auto_gen_search_space.json", "r"))) - print(tuner.generate_multiple_parameters([_ for _ in range(20)])) + parameters = tuner.generate_multiple_parameters([_ for _ in range(20)]) + for i in range(20): + tuner.trial_end(i, False) + + for param in parameters: + for k, v in param.items(): + v.pop("_value") + print(param) From 3115531684ae7e5b59e64840ae78ba7a27bc7796 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 3 Dec 2019 20:29:17 +0800 Subject: [PATCH 14/57] fix a few issues --- examples/nas/spos/config_search.yml | 17 +++++++++++++++++ examples/nas/spos/evolution.py | 19 +++++-------------- examples/nas/spos/network.py | 5 +++-- .../nni/nas/pytorch/classic_nas/mutator.py | 2 +- 4 files changed, 26 insertions(+), 17 deletions(-) create mode 100644 examples/nas/spos/config_search.yml diff --git a/examples/nas/spos/config_search.yml b/examples/nas/spos/config_search.yml new file mode 100644 index 0000000000..cd3dbe7ef3 --- /dev/null +++ b/examples/nas/spos/config_search.yml @@ -0,0 +1,17 @@ +authorName: unknown +experimentName: SPOS Search +trialConcurrency: 4 +maxExecDuration: 7d +maxTrialNum: 99999 +trainingServicePlatform: local +searchSpacePath: nni_auto_gen_search_space.json +useAnnotation: false +tuner: + codeDir: . + classFileName: evolution.py + className: SPOSEvolution +trial: + # to fit in a GTX 1080 + command: python tester.py --imagenet-dir /data/hdd3/yugzh/imagenet --spos-prep --batch-size 80 --train-iters 250 --test-iters 100 + codeDir: . + gpuNum: 1 diff --git a/examples/nas/spos/evolution.py b/examples/nas/spos/evolution.py index aea670e996..7879f09c2b 100644 --- a/examples/nas/spos/evolution.py +++ b/examples/nas/spos/evolution.py @@ -10,7 +10,7 @@ _logger = logging.getLogger("nni") -class Evolution(Tuner): +class SPOSEvolution(Tuner): def __init__(self, max_epochs=20, num_select=10, num_population=50, m_prob=0.1, num_crossover=25, num_mutation=25, flops_limit=330E6): @@ -63,6 +63,7 @@ def _random_candidate(self): return chosen_arch def _add_to_evaluate_queue(self, cand): + _logger.info("Generate candidate with flops %d.", self.model.get_candidate_flops(cand)) self._reward_dict[self._hashcode(cand)] = 0. 
self._to_evaluate_queue.append(cand) @@ -90,6 +91,7 @@ def _get_crossover(self, best): self._add_to_evaluate_queue(cand) if len(result) >= self.num_crossover: break + _logger.info("Found %d architectures with crossover.", len(result)) return result def _get_mutation(self, best): @@ -107,6 +109,7 @@ def _get_mutation(self, best): self._add_to_evaluate_queue(cand) if len(result) >= self.num_mutation: break + _logger.info("Found %d architectures with mutation.", len(result)) return result def _is_legal(self, cand): @@ -134,6 +137,7 @@ def generate_multiple_parameters(self, parameter_id_list, **kwargs): self._id2candidate[parameter_id] = parameters result.append(parameters) self._pending_result_ids.add(parameter_id) + _logger.info("Requested %d parameters, %d sent.", len(parameter_id_list), len(result)) return result def receive_trial_result(self, parameter_id, parameters, value, **kwargs): @@ -144,16 +148,3 @@ def trial_end(self, parameter_id, success, **kwargs): if not self._pending_result_ids: # a new epoch now self._next_round() - - -if __name__ == "__main__": - tuner = Evolution() - tuner.update_search_space(json.load(open("nni_auto_gen_search_space.json", "r"))) - parameters = tuner.generate_multiple_parameters([_ for _ in range(20)]) - for i in range(20): - tuner.trial_end(i, False) - - for param in parameters: - for k, v in param.items(): - v.pop("_value") - print(param) diff --git a/examples/nas/spos/network.py b/examples/nas/spos/network.py index eefbe5c99e..14ae871cfd 100644 --- a/examples/nas/spos/network.py +++ b/examples/nas/spos/network.py @@ -1,3 +1,4 @@ +import os import pickle import re @@ -21,7 +22,7 @@ def __init__(self, input_size=224, first_conv_channels=16, last_conv_channels=10 super().__init__() assert input_size % 32 == 0 - with open("./data/op_flops_dict.pkl", "rb") as fp: + with open(os.path.join(os.path.dirname(__file__), "./data/op_flops_dict.pkl"), "rb") as fp: self._op_flops_dict = pickle.load(fp) self.stage_blocks = [4, 4, 8, 4] @@ -112,7 +113,7 @@ def get_candidate_flops(self, candidate): if isinstance(m, dict): # to be compatible with classical nas format total_flops += parsed_flops_dict[m["_idx"]] else: - total_flops += [torch.max(m, 0)[1]] + total_flops += parsed_flops_dict[torch.max(m, 0)[1]] return total_flops def _initialize_weights(self): diff --git a/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py b/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py index 0fad38a1b7..a518356fa8 100644 --- a/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py @@ -60,7 +60,7 @@ def __init__(self, model): sys.exit(0) if trial_env_vars.NNI_PLATFORM is None: - logger.warning("This is in standalone mode, the chosen are the first one(s)") + logger.warning("This is in standalone mode, the chosen are the first one(s).") self._chosen_arch = self._standalone_generate_chosen() else: # get chosen arch from tuner From 971822cdd108a6e88a784d49555037a12843d125 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 3 Dec 2019 15:52:19 +0000 Subject: [PATCH 15/57] add model checkpoint --- examples/nas/spos/supernet.py | 6 ++++-- src/sdk/pynni/nni/nas/pytorch/callbacks.py | 24 +++++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/examples/nas/spos/supernet.py b/examples/nas/spos/supernet.py index f0b4127686..480187603c 100644 --- a/examples/nas/spos/supernet.py +++ b/examples/nas/spos/supernet.py @@ -5,6 +5,7 @@ from nni.nas.pytorch.callbacks import Callback, LRSchedulerCallback from network 
import ShuffleNetV2OneShot, load_and_parse_state_dict +from nni.nas.pytorch.callbacks import ModelCheckpoint from nni.nas.pytorch.spos import SPOSSupernetTrainingMutator, SPOSSupernetTrainer from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy @@ -25,7 +26,7 @@ def on_epoch_begin(self, epoch): "(as in original repo).") parser.add_argument("--workers", type=int, default=4) parser.add_argument("--batch-size", type=int, default=1024) - parser.add_argument("--epochs", type=int, default=15) + parser.add_argument("--epochs", type=int, default=120) parser.add_argument("--learning-rate", type=float, default=0.5) parser.add_argument("--momentum", type=float, default=0.9) parser.add_argument("--weight-decay", type=float, default=4E-5) @@ -54,6 +55,7 @@ def on_epoch_begin(self, epoch): args.epochs, dataset_train, dataset_valid, mutator=mutator, batch_size=args.batch_size, log_frequency=args.log_frequency, workers=args.workers, - callbacks=[LRSchedulerCallback(scheduler), AdjustBNMomentum()]) + callbacks=[LRSchedulerCallback(scheduler), AdjustBNMomentum(), + ModelCheckpoint("./checkpoints")]) trainer.train() # trainer.validate() diff --git a/src/sdk/pynni/nni/nas/pytorch/callbacks.py b/src/sdk/pynni/nni/nas/pytorch/callbacks.py index b24c2b6fe0..94dbea016a 100644 --- a/src/sdk/pynni/nni/nas/pytorch/callbacks.py +++ b/src/sdk/pynni/nni/nas/pytorch/callbacks.py @@ -4,6 +4,9 @@ import logging import os +import torch +import torch.nn as nn + _logger = logging.getLogger(__name__) @@ -51,4 +54,23 @@ def __init__(self, checkpoint_dir, every="epoch"): os.makedirs(self.checkpoint_dir, exist_ok=True) def on_epoch_end(self, epoch): - self.trainer.export(os.path.join(self.checkpoint_dir, "epoch_{}.json".format(epoch))) + dest_path = os.path.join(self.checkpoint_dir, "epoch_{}.json".format(epoch)) + _logger.info("Saving architecture to %s", dest_path) + self.trainer.export(dest_path) + + +class ModelCheckpoint(Callback): + def __init__(self, checkpoint_dir, every="epoch"): + super().__init__() + assert every == "epoch" + self.checkpoint_dir = checkpoint_dir + os.makedirs(self.checkpoint_dir, exist_ok=True) + + def on_epoch_begin(self, epoch): + if isinstance(self.model, nn.DataParallel): + state_dict = self.model.module.state_dict() + else: + state_dict = self.model.state_dict() + dest_path = os.path.join(self.checkpoint_dir, "epoch_{}.pth.tar".format(epoch)) + _logger.info("Saving model to %s", dest_path) + torch.save(state_dict, dest_path) From 5f51bb58acc9c4df1ad52dd425467a735000a33f Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 3 Dec 2019 15:54:48 +0000 Subject: [PATCH 16/57] update --- src/sdk/pynni/nni/nas/pytorch/callbacks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sdk/pynni/nni/nas/pytorch/callbacks.py b/src/sdk/pynni/nni/nas/pytorch/callbacks.py index 94dbea016a..c245a8c3a2 100644 --- a/src/sdk/pynni/nni/nas/pytorch/callbacks.py +++ b/src/sdk/pynni/nni/nas/pytorch/callbacks.py @@ -66,7 +66,7 @@ def __init__(self, checkpoint_dir, every="epoch"): self.checkpoint_dir = checkpoint_dir os.makedirs(self.checkpoint_dir, exist_ok=True) - def on_epoch_begin(self, epoch): + def on_epoch_end(self, epoch): if isinstance(self.model, nn.DataParallel): state_dict = self.model.module.state_dict() else: From 69261af0e8d3e70aa9f0beb5d6d94518b780d69f Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Wed, 4 Dec 2019 13:41:58 +0800 Subject: [PATCH 17/57] fix evolution tuner --- examples/nas/spos/evolution.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 
7 deletions(-) diff --git a/examples/nas/spos/evolution.py b/examples/nas/spos/evolution.py index 7879f09c2b..2dfd804e67 100644 --- a/examples/nas/spos/evolution.py +++ b/examples/nas/spos/evolution.py @@ -34,6 +34,7 @@ def __init__(self, max_epochs=20, num_select=10, num_population=50, m_prob=0.1, self._pending_result_ids = set() self._reward_dict = dict() self._id2candidate = dict() + self._st_callback = None def update_search_space(self, search_space): self._search_space = search_space @@ -63,7 +64,7 @@ def _random_candidate(self): return chosen_arch def _add_to_evaluate_queue(self, cand): - _logger.info("Generate candidate with flops %d.", self.model.get_candidate_flops(cand)) + _logger.info("Generate candidate with flops %d, adding to eval queue.", self.model.get_candidate_flops(cand)) self._reward_dict[self._hashcode(cand)] = 0. self._to_evaluate_queue.append(cand) @@ -120,31 +121,47 @@ def _is_legal(self, cand): return True def _select_top_candidates(self): - return sorted(self.candidates, key=lambda cand: self._reward_dict[self._hashcode(cand)], - reverse=True)[:self.num_select] + reward_query = lambda cand: self._reward_dict[self._hashcode(cand)] + result = sorted(self.candidates, key=reward_query, reverse=True)[:self.num_select] + _logger.info("Best candidate rewards: %s", list(map(reward_query, result))) + return result @staticmethod def _hashcode(d): return json.dumps(d, sort_keys=True) - def generate_multiple_parameters(self, parameter_id_list, **kwargs): + def _bind_and_send_parameters(self, use_st_callback=False): result = [] - for parameter_id in parameter_id_list: - self._sending_parameter_queue.append(parameter_id) while self._sending_parameter_queue and self._to_evaluate_queue: parameter_id = self._sending_parameter_queue.popleft() parameters = self._to_evaluate_queue.popleft() self._id2candidate[parameter_id] = parameters result.append(parameters) self._pending_result_ids.add(parameter_id) + if use_st_callback: + self._st_callback(parameter_id, parameters) + _logger.info("Sending extra parameter with callback.") + return result + + def generate_multiple_parameters(self, parameter_id_list, **kwargs): + if "st_callback" in kwargs: + self._st_callback = kwargs["st_callback"] + for parameter_id in parameter_id_list: + self._sending_parameter_queue.append(parameter_id) + result = self._bind_and_send_parameters() _logger.info("Requested %d parameters, %d sent.", len(parameter_id_list), len(result)) return result def receive_trial_result(self, parameter_id, parameters, value, **kwargs): + _logger.info("Candidate %d, reported reward %f", parameter_id, value) self._reward_dict[self._hashcode(self._id2candidate[parameter_id])] = value def trial_end(self, parameter_id, success, **kwargs): self._pending_result_ids.remove(parameter_id) - if not self._pending_result_ids: + if not self._pending_result_ids and not self._to_evaluate_queue: # a new epoch now self._next_round() + if self._st_callback is not None: + self._bind_and_send_parameters(use_st_callback=True) + else: + _logger.warning("No send callback found.") From e3dddf18e5a989d5e9a7ce104e32f0d94eb32a5a Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Wed, 4 Dec 2019 17:06:38 +0800 Subject: [PATCH 18/57] update --- examples/nas/spos/config_search.yml | 4 +- examples/nas/spos/scratch.py | 54 ++++++++++++++++++++ examples/nas/spos/supernet.py | 11 +--- examples/nas/spos/tester.py | 78 ++++++++++++++++++----------- 4 files changed, 108 insertions(+), 39 deletions(-) create mode 100644 examples/nas/spos/scratch.py diff --git 
a/examples/nas/spos/config_search.yml b/examples/nas/spos/config_search.yml index cd3dbe7ef3..1625c8f431 100644 --- a/examples/nas/spos/config_search.yml +++ b/examples/nas/spos/config_search.yml @@ -1,6 +1,6 @@ authorName: unknown experimentName: SPOS Search -trialConcurrency: 4 +trialConcurrency: 2 maxExecDuration: 7d maxTrialNum: 99999 trainingServicePlatform: local @@ -12,6 +12,6 @@ tuner: className: SPOSEvolution trial: # to fit in a GTX 1080 - command: python tester.py --imagenet-dir /data/hdd3/yugzh/imagenet --spos-prep --batch-size 80 --train-iters 250 --test-iters 100 + command: python tester.py --imagenet-dir /data/ssd1/v-yugzh/imagenet --spos-prep --workers 6 codeDir: . gpuNum: 1 diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py new file mode 100644 index 0000000000..601c32e7d8 --- /dev/null +++ b/examples/nas/spos/scratch.py @@ -0,0 +1,54 @@ +import argparse + +import torch +import torch.nn as nn +from nni.nas.pytorch.callbacks import Callback, LRSchedulerCallback + +from network import ShuffleNetV2OneShot, load_and_parse_state_dict +from nni.nas.pytorch.callbacks import ModelCheckpoint +from nni.nas.pytorch.spos import SPOSSupernetTrainingMutator, SPOSSupernetTrainer +from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("SPOS Supernet Training") + parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet") + parser.add_argument("--load-checkpoint", action="store_true", default=False) + parser.add_argument("--spos-preprocessing", action="store_true", default=False, + help="When true, image values will range from 0 to 255 and use BGR " + "(as in original repo).") + parser.add_argument("--workers", type=int, default=4) + parser.add_argument("--batch-size", type=int, default=1024) + parser.add_argument("--epochs", type=int, default=120) + parser.add_argument("--learning-rate", type=float, default=1.) 
+ parser.add_argument("--momentum", type=float, default=0.9) + parser.add_argument("--weight-decay", type=float, default=4E-5) + parser.add_argument("--label-smooth", type=float, default=0.1) + parser.add_argument("--log-frequency", type=int, default=10) + + args = parser.parse_args() + dataset_train, dataset_valid = get_imagenet(args.imagenet_dir, spos_pre=args.spos_preprocessing) + model = ShuffleNetV2OneShot() + if args.load_checkpoint: + if not args.spos_preprocessing: + print("You might want to use SPOS preprocessing if you are loading their checkpoints.") + model.load_state_dict(load_and_parse_state_dict()) + model.cuda() + model = nn.DataParallel(model) + mutator = SPOSSupernetTrainingMutator(model, flops_func=model.module.get_candidate_flops, + flops_lb=290E6, flops_ub=360E6) + criterion = CrossEntropyLabelSmooth(1000, 0.1) + optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, + momentum=args.momentum, weight_decay=args.weight_decay) + scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, + lambda step: (1.0 - step / args.epochs) + if step <= args.epochs else 0, + last_epoch=-1) + trainer = SPOSSupernetTrainer(model, criterion, accuracy, optimizer, + args.epochs, dataset_train, dataset_valid, + mutator=mutator, batch_size=args.batch_size, + log_frequency=args.log_frequency, workers=args.workers, + callbacks=[LRSchedulerCallback(scheduler), AdjustBNMomentum(), + ModelCheckpoint("./checkpoints")]) + trainer.train() + # trainer.validate() diff --git a/examples/nas/spos/supernet.py b/examples/nas/spos/supernet.py index 480187603c..38c052fbd7 100644 --- a/examples/nas/spos/supernet.py +++ b/examples/nas/spos/supernet.py @@ -10,13 +10,6 @@ from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy -class AdjustBNMomentum(Callback): - def on_epoch_begin(self, epoch): - for m in self.model.modules(): - if isinstance(m, nn.BatchNorm2d): - m.momentum = 1 / (epoch + 1) - - if __name__ == "__main__": parser = argparse.ArgumentParser("SPOS Supernet Training") parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet") @@ -27,7 +20,7 @@ def on_epoch_begin(self, epoch): parser.add_argument("--workers", type=int, default=4) parser.add_argument("--batch-size", type=int, default=1024) parser.add_argument("--epochs", type=int, default=120) - parser.add_argument("--learning-rate", type=float, default=0.5) + parser.add_argument("--learning-rate", type=float, default=1.) 
parser.add_argument("--momentum", type=float, default=0.9) parser.add_argument("--weight-decay", type=float, default=4E-5) parser.add_argument("--label-smooth", type=float, default=0.1) @@ -55,7 +48,7 @@ def on_epoch_begin(self, epoch): args.epochs, dataset_train, dataset_valid, mutator=mutator, batch_size=args.batch_size, log_frequency=args.log_frequency, workers=args.workers, - callbacks=[LRSchedulerCallback(scheduler), AdjustBNMomentum(), + callbacks=[LRSchedulerCallback(scheduler), ModelCheckpoint("./checkpoints")]) trainer.train() # trainer.validate() diff --git a/examples/nas/spos/tester.py b/examples/nas/spos/tester.py index 17221d101c..f0b5658d32 100644 --- a/examples/nas/spos/tester.py +++ b/examples/nas/spos/tester.py @@ -3,11 +3,12 @@ from itertools import cycle import nni +import numpy as np import torch import torch.nn as nn from nni.nas.pytorch.classic_nas import get_and_apply_next_architecture from nni.nas.pytorch.utils import AverageMeterGroup -from torch.utils.data import DataLoader +from torch.utils.data import DataLoader, SubsetRandomSampler from network import ShuffleNetV2OneShot, load_and_parse_state_dict from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy @@ -16,26 +17,26 @@ def retrain_bn(model, criterion, max_iters, log_freq, loader_train, device): - logger.info("Clear BN statistics...") - for m in model.modules(): - if isinstance(m, nn.BatchNorm2d): - m.running_mean = torch.zeros_like(m.running_mean) - m.running_var = torch.ones_like(m.running_var) - - logger.info("Train BN with training set (BN sanitize)...") - model.train() - meters = AverageMeterGroup() - for step in range(max_iters): - inputs, targets = next(loader_train) - inputs, targets = inputs.to(device), targets.to(device) - model.zero_grad() - logits = model(inputs) - loss = criterion(logits, targets) - metrics = accuracy(logits, targets) - metrics["loss"] = loss.item() - meters.update(metrics) - if step % log_freq == 0 or step + 1 == max_iters: - logger.info("Train Step [%d/%d] %s", step + 1, max_iters, meters) + with torch.no_grad(): + logger.info("Clear BN statistics...") + for m in model.modules(): + if isinstance(m, nn.BatchNorm2d): + m.running_mean = torch.zeros_like(m.running_mean) + m.running_var = torch.ones_like(m.running_var) + + logger.info("Train BN with training set (BN sanitize)...") + model.train() + meters = AverageMeterGroup() + for step in range(max_iters): + inputs, targets = next(loader_train) + inputs, targets = inputs.to(device), targets.to(device) + logits = model(inputs) + loss = criterion(logits, targets) + metrics = accuracy(logits, targets) + metrics["loss"] = loss.item() + meters.update(metrics) + if step % log_freq == 0 or step + 1 == max_iters: + logger.info("Train Step [%d/%d] %s", step + 1, max_iters, meters) def test_acc(model, criterion, max_iters, log_freq, loader_test, device): @@ -57,12 +58,20 @@ def test_acc(model, criterion, max_iters, log_freq, loader_test, device): def evaluate_acc(model, criterion, args, loader_train, loader_test, device): + acc_before = test_acc(model, criterion, args.test_iters, args.log_frequency, loader_test, device) + nni.report_intermediate_result(acc_before) + retrain_bn(model, criterion, args.train_iters, args.log_frequency, loader_train, device) acc = test_acc(model, criterion, args.test_iters, args.log_frequency, loader_test, device) assert isinstance(acc, float) nni.report_final_result(acc) +def generate_subset_indices(dataset, batch_size, iters): + dataset_length = len(dataset) + return np.random.choice(dataset_length, 
min(batch_size * iters * 2, dataset_length), replace=False)
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser("SPOS Candidate Tester")
     parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet")
@@ -70,13 +79,24 @@ def evaluate_acc(model, criterion, args, loader_train, loader_test, device):
     parser.add_argument("--spos-preprocessing", action="store_true", default=False,
                         help="When true, image values will range from 0 to 255 and use BGR "
                              "(as in original repo).")
+    parser.add_argument("--deterministic", action="store_true", default=False)
     parser.add_argument("--workers", type=int, default=4)
-    parser.add_argument("--batch-size", type=int, default=200)
-    parser.add_argument("--train-iters", type=int, default=128)
-    parser.add_argument("--test-iters", type=int, default=40)
+    parser.add_argument("--train-batch-size", type=int, default=128)
+    parser.add_argument("--train-iters", type=int, default=200)
+    parser.add_argument("--test-batch-size", type=int, default=1024)
+    parser.add_argument("--test-iters", type=int, default=10)
     parser.add_argument("--log-frequency", type=int, default=10)
     args = parser.parse_args()
+
+    if args.deterministic:
+        # using a fixed set of images will improve the performance
+        torch.manual_seed(0)
+        torch.cuda.manual_seed_all(0)
+        np.random.seed(0)
+        random.seed(0)
+        torch.backends.cudnn.deterministic = True
+
     use_gpu = torch.cuda.is_available()
     device = torch.device("cuda") if use_gpu else torch.device("cpu")
@@ -87,10 +107,12 @@ def evaluate_acc(model, criterion, args, loader_train, loader_test, device):
     model.to(device)
     dataset_train, dataset_valid = get_imagenet(args.imagenet_dir, spos_pre=args.spos_preprocessing)
-    loader_train = DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True,
-                              num_workers=args.workers, pin_memory=use_gpu)
-    loader_valid = DataLoader(dataset_valid, batch_size=args.batch_size, shuffle=True,
-                              num_workers=args.workers, pin_memory=use_gpu)
+    sampler_train = SubsetRandomSampler(generate_subset_indices(dataset_train, args.train_batch_size, args.train_iters))
+    sampler_valid = SubsetRandomSampler(generate_subset_indices(dataset_valid, args.test_batch_size, args.test_iters))
+    loader_train = DataLoader(dataset_train, batch_size=args.train_batch_size,
+                              sampler=sampler_train, num_workers=args.workers)
+    loader_valid = DataLoader(dataset_valid, batch_size=args.test_batch_size,
+                              sampler=sampler_valid, num_workers=args.workers)
     loader_train, loader_valid = cycle(loader_train), cycle(loader_valid)
     evaluate_acc(model, criterion, args, loader_train, loader_valid, device)

From bc383662233ccb23678aac9caa971322080179a6 Mon Sep 17 00:00:00 2001
From: Yuge Zhang
Date: Wed, 4 Dec 2019 17:39:20 +0800
Subject: [PATCH 19/57] update training from scratch

---
 examples/nas/spos/readme.md  |  3 ++
 examples/nas/spos/scratch.py | 58 +++++++++++++++++++++++++++-------
 2 files changed, 51 insertions(+), 10 deletions(-)

diff --git a/examples/nas/spos/readme.md b/examples/nas/spos/readme.md
index 1b7c4a8924..b2f2697098 100644
--- a/examples/nas/spos/readme.md
+++ b/examples/nas/spos/readme.md
@@ -6,3 +6,6 @@ Single Path One-Shot by Megvii Research.
 Need to download the flops lookup table from [here](https://1drv.ms/u/s!Am_mmG2-KsrnajesvSdfsq_cN48?e=aHVppN).
 Put `op_flops_dict.pkl` under `data` directory.
+
+## Step 1.
Train Supernet + diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py index 601c32e7d8..1d735a0dcf 100644 --- a/examples/nas/spos/scratch.py +++ b/examples/nas/spos/scratch.py @@ -1,4 +1,5 @@ import argparse +import logging import torch import torch.nn as nn @@ -6,20 +7,60 @@ from network import ShuffleNetV2OneShot, load_and_parse_state_dict from nni.nas.pytorch.callbacks import ModelCheckpoint -from nni.nas.pytorch.spos import SPOSSupernetTrainingMutator, SPOSSupernetTrainer +from nni.nas.pytorch.fixed import apply_fixed_architecture +from nni.nas.pytorch.utils import AverageMeterGroup from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy +logger = logging.getLogger("nni") + + +def train(epoch, model, criterion, optimizer, loader, args): + model.train() + meters = AverageMeterGroup() + for step, (x, y) in enumerate(loader): + x, y = x.cuda(non_blocking=True), y.cuda(non_blocking=True) + optimizer.zero_grad() + logits = model(x) + loss = criterion(logits, y) + loss.backward() + optimizer.step() + + metrics = accuracy(logits, y) + metrics["loss"] = loss.item() + meters.update(metrics) + if step % args.log_frequency == 0 or step + 1 == len(loader): + logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch + 1, + args.epochs, step + 1, len(loader), meters) + + +def validate(epoch, model, criterion, loader, args): + model.eval() + meters = AverageMeterGroup() + with torch.no_grad(): + for step, (x, y) in enumerate(loader): + x, y = x.cuda(non_blocking=True), y.cuda(non_blocking=True) + logits = model(x) + loss = criterion(logits, y) + metrics = accuracy(logits, y) + metrics["loss"] = loss.item() + meters.update(metrics) + if step % args.log_frequency == 0 or step + 1 == len(loader): + logger.info("Epoch [%s/%s] Validation Step [%s/%s] %s", epoch + 1, + args.epochs, step + 1, len(loader), meters) + + if __name__ == "__main__": parser = argparse.ArgumentParser("SPOS Supernet Training") parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet") parser.add_argument("--load-checkpoint", action="store_true", default=False) + parser.add_argument("--architecture", type=str, required=True) parser.add_argument("--spos-preprocessing", action="store_true", default=False, help="When true, image values will range from 0 to 255 and use BGR " "(as in original repo).") parser.add_argument("--workers", type=int, default=4) parser.add_argument("--batch-size", type=int, default=1024) - parser.add_argument("--epochs", type=int, default=120) + parser.add_argument("--epochs", type=int, default=240) parser.add_argument("--learning-rate", type=float, default=1.) 
parser.add_argument("--momentum", type=float, default=0.9) parser.add_argument("--weight-decay", type=float, default=4E-5) @@ -34,6 +75,7 @@ print("You might want to use SPOS preprocessing if you are loading their checkpoints.") model.load_state_dict(load_and_parse_state_dict()) model.cuda() + apply_fixed_architecture(model, args.fixed_arc_path) model = nn.DataParallel(model) mutator = SPOSSupernetTrainingMutator(model, flops_func=model.module.get_candidate_flops, flops_lb=290E6, flops_ub=360E6) @@ -44,11 +86,7 @@ lambda step: (1.0 - step / args.epochs) if step <= args.epochs else 0, last_epoch=-1) - trainer = SPOSSupernetTrainer(model, criterion, accuracy, optimizer, - args.epochs, dataset_train, dataset_valid, - mutator=mutator, batch_size=args.batch_size, - log_frequency=args.log_frequency, workers=args.workers, - callbacks=[LRSchedulerCallback(scheduler), AdjustBNMomentum(), - ModelCheckpoint("./checkpoints")]) - trainer.train() - # trainer.validate() + for epoch in range(args.epochs): + train(epoch, model, criterion, optimizer, train_loader, args) + validate(epoch, model, criterion, test_loader, args) + scheduler.step() From a445e54f418f7e2b4af5c7e308fb0ce9cc92a510 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Thu, 5 Dec 2019 20:06:58 +0800 Subject: [PATCH 20/57] add decision --- src/sdk/pynni/nni/nas/pytorch/decision.py | 65 +++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 src/sdk/pynni/nni/nas/pytorch/decision.py diff --git a/src/sdk/pynni/nni/nas/pytorch/decision.py b/src/sdk/pynni/nni/nas/pytorch/decision.py new file mode 100644 index 0000000000..7d3b1bd0e5 --- /dev/null +++ b/src/sdk/pynni/nni/nas/pytorch/decision.py @@ -0,0 +1,65 @@ +class Decision: + def __init__(self): + raise NotImplementedError("You should never use init to initialize a general decision.") + + @classmethod + def from_nni_protocol_format(cls, candidate, search_space): + assert "_idx" in candidate and "_val" in candidate, "A candidate must have '_idx' and '_val' in its fields." + assert type(candidate["_idx"]) == type(candidate["_val"]), "Indices and values must have the same type." + search_space_values = search_space["_values"] + if isinstance(candidate["_idx"], list): + assert len(candidate["_idx"]) == len(candidate["_val"]), \ + "Number of indices must be equal of number of values." + for idx, val in zip(candidate["_idx"], candidate["_val"]): + assert 0 <= idx < len(search_space_values) and search_space_values[idx] == val, \ + "Index '{}' in search space '{}' is not '{}'".format(idx, search_space_values, val) + elif isinstance(candidate["_idx"], int): + idx, val = candidate["_idx"], candidate["_val"] + assert 0 <= idx < len(search_space_values) and search_space_values[idx] == val, \ + "Index '{}' in search space '{}' is not '{}'".format(idx, search_space_values, val) + else: + raise ValueError("Index of unrecognized type: {}".format(candidate["_idx"])) + return cls.from_indices(candidate["_idx"], len(search_space_values)) + + @classmethod + def from_indices(cls, indices, n_candidates): + """ + Construct a decision from indices. 
+ + Parameters + ---------- + indices : int or list of int + n_candidates : int + + Returns + ------- + RelaxedDecision + """ + return RelaxedDecision(indices, n_candidates) + + @classmethod + def deserialize(cls, obj): + pass + + def serialize(self): + raise NotImplementedError + + +class RelaxedDecision(Decision): + def __init__(self, indices, n_candidates): + if isinstance(indices, int): + self.indices = [indices] + elif isinstance(indices, list): + self.indices = indices + assert len(set(self.indices)) == len(self.indices), "Indices must be unique" + assert all(map(lambda x: 0 <= x < n_candidates, self.indices)), "Indices must be in range [0, n_candidates)." + self.n_candidates = n_candidates + + @classmethod + def from_multi_hot_iterable(cls, iterable): + indices, total = [], 0 + for i, t in enumerate(iterable): + if t: + indices.append(i) + total += 1 + return cls(indices, total) From 99b3b740048d45d1b13cfec295de80186c6333b4 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Fri, 6 Dec 2019 11:50:42 +0800 Subject: [PATCH 21/57] decision class track in --- src/sdk/pynni/nni/nas/pytorch/decision.py | 113 ++++++++++++++++++---- 1 file changed, 96 insertions(+), 17 deletions(-) diff --git a/src/sdk/pynni/nni/nas/pytorch/decision.py b/src/sdk/pynni/nni/nas/pytorch/decision.py index 7d3b1bd0e5..2f9eddf1d0 100644 --- a/src/sdk/pynni/nni/nas/pytorch/decision.py +++ b/src/sdk/pynni/nni/nas/pytorch/decision.py @@ -1,28 +1,38 @@ +import logging + +import numpy as np +import torch + +_logger = logging.getLogger(__name__) + + class Decision: def __init__(self): raise NotImplementedError("You should never use init to initialize a general decision.") @classmethod - def from_nni_protocol_format(cls, candidate, search_space): + def from_nni_protocol_format(cls, candidate, search_space=None): assert "_idx" in candidate and "_val" in candidate, "A candidate must have '_idx' and '_val' in its fields." assert type(candidate["_idx"]) == type(candidate["_val"]), "Indices and values must have the same type." - search_space_values = search_space["_values"] - if isinstance(candidate["_idx"], list): - assert len(candidate["_idx"]) == len(candidate["_val"]), \ - "Number of indices must be equal of number of values." - for idx, val in zip(candidate["_idx"], candidate["_val"]): + if search_space is not None: + search_space_values = search_space["_values"] + if isinstance(candidate["_idx"], list): + assert len(candidate["_idx"]) == len(candidate["_val"]), \ + "Number of indices must be equal of number of values." 
+ for idx, val in zip(candidate["_idx"], candidate["_val"]): + assert 0 <= idx < len(search_space_values) and search_space_values[idx] == val, \ + "Index '{}' in search space '{}' is not '{}'".format(idx, search_space_values, val) + elif isinstance(candidate["_idx"], int): + idx, val = candidate["_idx"], candidate["_val"] assert 0 <= idx < len(search_space_values) and search_space_values[idx] == val, \ "Index '{}' in search space '{}' is not '{}'".format(idx, search_space_values, val) - elif isinstance(candidate["_idx"], int): - idx, val = candidate["_idx"], candidate["_val"] - assert 0 <= idx < len(search_space_values) and search_space_values[idx] == val, \ - "Index '{}' in search space '{}' is not '{}'".format(idx, search_space_values, val) - else: - raise ValueError("Index of unrecognized type: {}".format(candidate["_idx"])) - return cls.from_indices(candidate["_idx"], len(search_space_values)) + else: + raise ValueError("Index of unrecognized type: {}".format(candidate["_idx"])) + return cls.from_indices(candidate["_idx"], len(search_space_values)) + return cls.from_indices(candidate["_idx"]) @classmethod - def from_indices(cls, indices, n_candidates): + def from_indices(cls, indices, n_candidates=None): """ Construct a decision from indices. @@ -39,21 +49,67 @@ def from_indices(cls, indices, n_candidates): @classmethod def deserialize(cls, obj): - pass + if obj is None: + return EmptyDecision() + if isinstance(obj, dict) and "_idx" in obj: + return cls.from_nni_protocol_format(obj) + if isinstance(obj, int): + return cls.from_indices(obj) + obj_type = cls._list_type(obj) + if obj_type == int: + # list of indices + return cls.from_indices(obj) + if obj_type == float: + # list of weights + return ContinuousDecision(obj) + if obj_type == bool: + # one/multi-hot tensor + return RelaxedDecision.from_multi_hot_iterable(obj) + + @staticmethod + def _list_type(lst): + # get the element type of a list / tensor + + def _print_all_01_warning(): + if all_01: + _logger.warning("All elements in %s are 0 and 1, but type is not bool.", lst) + + all_01 = all(map(lambda x: x in [0., 1.], lst)) + if torch.is_tensor(lst): + type_lower = lst.type().lower() + if "bool" in type_lower: + return bool + _print_all_01_warning() + if "float" in type_lower: + return float + raise ValueError("Unsupported tensor type: {}".format(type_lower)) + if all(map(lambda x: isinstance(x, bool), lst)): + return bool + _print_all_01_warning() + for t in (int, float): + if all(map(lambda x: isinstance(x, t), lst)): + return t def serialize(self): raise NotImplementedError +class EmptyDecision(Decision): + def serialize(self): + return None + + class RelaxedDecision(Decision): - def __init__(self, indices, n_candidates): + def __init__(self, indices, n_candidates=None): if isinstance(indices, int): self.indices = [indices] elif isinstance(indices, list): self.indices = indices assert len(set(self.indices)) == len(self.indices), "Indices must be unique" - assert all(map(lambda x: 0 <= x < n_candidates, self.indices)), "Indices must be in range [0, n_candidates)." self.n_candidates = n_candidates + if n_candidates is not None: + assert all(map(lambda x: 0 <= x < n_candidates, self.indices)), \ + "Indices must be in range [0, n_candidates)." 
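As an aside (not part of this patch), a minimal sketch of how the `deserialize` dispatch added above routes the different serialized forms, assuming the new module ends up importable as `nni.nas.pytorch.decision`:

    from nni.nas.pytorch.decision import ContinuousDecision, Decision, RelaxedDecision

    # A bare index or a list of indices is wrapped into a RelaxedDecision.
    assert isinstance(Decision.deserialize(2), RelaxedDecision)
    assert Decision.deserialize([0, 3]).indices == [0, 3]

    # A boolean one-hot / multi-hot list goes through from_multi_hot_iterable (defined below).
    assert Decision.deserialize([False, True, False]).indices == [1]

    # A list of floats is interpreted as architecture weights.
    assert isinstance(Decision.deserialize([0.2, 0.5, 0.3]), ContinuousDecision)

    # Caveat: deserialize(None) returns EmptyDecision(), which as written inherits
    # Decision.__init__ and would therefore raise NotImplementedError.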
@classmethod def from_multi_hot_iterable(cls, iterable): @@ -63,3 +119,26 @@ def from_multi_hot_iterable(cls, iterable): indices.append(i) total += 1 return cls(indices, total) + + def serialize(self): + if len(self.indices) == 1: + return self.index + return self.indices + + @property + def index(self): + if len(self.indices) > 1: + raise ValueError("More than one indices. Index doesn't work.") + return self.indices[0] + + +class ContinuousDecision: + def __init__(self, weights): + self.weights = weights + + def serialize(self): + if torch.is_tensor(self.weights): + return self.weights.detach().numpy().tolist() + if isinstance(self.weights, np.ndarray): + return self.weights.tolist() + return self.weights From bdfc2e7d18905c7560d5d1048669944ba064005c Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Fri, 6 Dec 2019 12:09:45 +0800 Subject: [PATCH 22/57] decision class track in --- src/sdk/pynni/nni/nas/pytorch/decision.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/sdk/pynni/nni/nas/pytorch/decision.py b/src/sdk/pynni/nni/nas/pytorch/decision.py index 2f9eddf1d0..e4fb59e67d 100644 --- a/src/sdk/pynni/nni/nas/pytorch/decision.py +++ b/src/sdk/pynni/nni/nas/pytorch/decision.py @@ -98,6 +98,9 @@ class EmptyDecision(Decision): def serialize(self): return None + def __iter__(self): + raise StopIteration + class RelaxedDecision(Decision): def __init__(self, indices, n_candidates=None): @@ -131,6 +134,9 @@ def index(self): raise ValueError("More than one indices. Index doesn't work.") return self.indices[0] + def __iter__(self): + return iter(self.indices) + class ContinuousDecision: def __init__(self, weights): @@ -142,3 +148,6 @@ def serialize(self): if isinstance(self.weights, np.ndarray): return self.weights.tolist() return self.weights + + def __iter__(self): + return iter(self.weights) From 1a55007cdaef808d79c439db57c3747028e749a0 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Fri, 6 Dec 2019 12:36:56 +0800 Subject: [PATCH 23/57] update from scratch training code --- examples/nas/spos/architecture_final.json | 122 ++++++++++++++++++++++ examples/nas/spos/scratch.py | 31 +++--- 2 files changed, 134 insertions(+), 19 deletions(-) create mode 100644 examples/nas/spos/architecture_final.json diff --git a/examples/nas/spos/architecture_final.json b/examples/nas/spos/architecture_final.json new file mode 100644 index 0000000000..66d07d9106 --- /dev/null +++ b/examples/nas/spos/architecture_final.json @@ -0,0 +1,122 @@ +{ + "LayerChoice1": [ + false, + false, + true, + false + ], + "LayerChoice10": [ + true, + false, + false, + false + ], + "LayerChoice11": [ + false, + false, + true, + false + ], + "LayerChoice12": [ + false, + false, + false, + true + ], + "LayerChoice13": [ + true, + false, + false, + false + ], + "LayerChoice14": [ + true, + false, + false, + false + ], + "LayerChoice15": [ + true, + false, + false, + false + ], + "LayerChoice16": [ + true, + false, + false, + false + ], + "LayerChoice17": [ + false, + false, + false, + true + ], + "LayerChoice18": [ + false, + false, + true, + false + ], + "LayerChoice19": [ + false, + false, + false, + true + ], + "LayerChoice2": [ + false, + true, + false, + false + ], + "LayerChoice20": [ + false, + false, + false, + true + ], + "LayerChoice3": [ + true, + false, + false, + false + ], + "LayerChoice4": [ + false, + true, + false, + false + ], + "LayerChoice5": [ + false, + false, + true, + false + ], + "LayerChoice6": [ + true, + false, + false, + false + ], + "LayerChoice7": [ + false, + false, + true, + false + ], + 
"LayerChoice8": [ + true, + false, + false, + false + ], + "LayerChoice9": [ + false, + false, + true, + false + ] +} diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py index 1d735a0dcf..1bbdf9750b 100644 --- a/examples/nas/spos/scratch.py +++ b/examples/nas/spos/scratch.py @@ -3,14 +3,12 @@ import torch import torch.nn as nn -from nni.nas.pytorch.callbacks import Callback, LRSchedulerCallback - -from network import ShuffleNetV2OneShot, load_and_parse_state_dict -from nni.nas.pytorch.callbacks import ModelCheckpoint from nni.nas.pytorch.fixed import apply_fixed_architecture from nni.nas.pytorch.utils import AverageMeterGroup -from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy +from torch.utils.data.dataloader import DataLoader +from network import ShuffleNetV2OneShot, load_and_parse_state_dict +from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy logger = logging.getLogger("nni") @@ -48,17 +46,14 @@ def validate(epoch, model, criterion, loader, args): if step % args.log_frequency == 0 or step + 1 == len(loader): logger.info("Epoch [%s/%s] Validation Step [%s/%s] %s", epoch + 1, args.epochs, step + 1, len(loader), meters) + logger.info("Epoch %d validation: %s", epoch + 1, meters) if __name__ == "__main__": parser = argparse.ArgumentParser("SPOS Supernet Training") parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet") - parser.add_argument("--load-checkpoint", action="store_true", default=False) parser.add_argument("--architecture", type=str, required=True) - parser.add_argument("--spos-preprocessing", action="store_true", default=False, - help="When true, image values will range from 0 to 255 and use BGR " - "(as in original repo).") - parser.add_argument("--workers", type=int, default=4) + parser.add_argument("--workers", type=int, default=8) parser.add_argument("--batch-size", type=int, default=1024) parser.add_argument("--epochs", type=int, default=240) parser.add_argument("--learning-rate", type=float, default=1.) 
@@ -68,17 +63,15 @@ def validate(epoch, model, criterion, loader, args): parser.add_argument("--log-frequency", type=int, default=10) args = parser.parse_args() - dataset_train, dataset_valid = get_imagenet(args.imagenet_dir, spos_pre=args.spos_preprocessing) + dataset_train, dataset_valid = get_imagenet(args.imagenet_dir) + train_loader = DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True, + num_workers=args.workers, pin_memory=True) + valid_loader = DataLoader(dataset_valid, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True) model = ShuffleNetV2OneShot() - if args.load_checkpoint: - if not args.spos_preprocessing: - print("You might want to use SPOS preprocessing if you are loading their checkpoints.") - model.load_state_dict(load_and_parse_state_dict()) model.cuda() - apply_fixed_architecture(model, args.fixed_arc_path) + apply_fixed_architecture(model, args.architecture) model = nn.DataParallel(model) - mutator = SPOSSupernetTrainingMutator(model, flops_func=model.module.get_candidate_flops, - flops_lb=290E6, flops_ub=360E6) criterion = CrossEntropyLabelSmooth(1000, 0.1) optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) @@ -88,5 +81,5 @@ def validate(epoch, model, criterion, loader, args): last_epoch=-1) for epoch in range(args.epochs): train(epoch, model, criterion, optimizer, train_loader, args) - validate(epoch, model, criterion, test_loader, args) + validate(epoch, model, criterion, valid_loader, args) scheduler.step() From c7a10d258475f08e363028bb519b03738dab5949 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Fri, 6 Dec 2019 13:30:01 +0800 Subject: [PATCH 24/57] update --- examples/nas/spos/scratch.py | 34 +++++++++++++++++++++++++++------- examples/nas/spos/supernet.py | 2 +- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py index 1bbdf9750b..2cb7f5c7fb 100644 --- a/examples/nas/spos/scratch.py +++ b/examples/nas/spos/scratch.py @@ -6,6 +6,7 @@ from nni.nas.pytorch.fixed import apply_fixed_architecture from nni.nas.pytorch.utils import AverageMeterGroup from torch.utils.data.dataloader import DataLoader +from torch.utils.tensorboard import SummaryWriter from network import ShuffleNetV2OneShot, load_and_parse_state_dict from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy @@ -13,10 +14,13 @@ logger = logging.getLogger("nni") -def train(epoch, model, criterion, optimizer, loader, args): +def train(epoch, model, criterion, optimizer, loader, writer, args): model.train() meters = AverageMeterGroup() + cur_lr = optimizer.param_groups[0]["lr"] + for step, (x, y) in enumerate(loader): + cur_step = len(loader) * epoch + step x, y = x.cuda(non_blocking=True), y.cuda(non_blocking=True) optimizer.zero_grad() logits = model(x) @@ -27,12 +31,18 @@ def train(epoch, model, criterion, optimizer, loader, args): metrics = accuracy(logits, y) metrics["loss"] = loss.item() meters.update(metrics) + + writer.add_scalar("lr", cur_lr, global_step=cur_step) + writer.add_scalar("loss/train", loss.item(), global_step=cur_step) + writer.add_scalar("acc1/train", metrics["acc1"], global_step=cur_step) + writer.add_scalar("acc5/train", metrics["acc5"], global_step=cur_step) + if step % args.log_frequency == 0 or step + 1 == len(loader): logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch + 1, args.epochs, step + 1, len(loader), meters) -def validate(epoch, model, criterion, loader, args): +def 
validate(epoch, model, criterion, loader, writer, args): model.eval() meters = AverageMeterGroup() with torch.no_grad(): @@ -43,27 +53,34 @@ def validate(epoch, model, criterion, loader, args): metrics = accuracy(logits, y) metrics["loss"] = loss.item() meters.update(metrics) + if step % args.log_frequency == 0 or step + 1 == len(loader): logger.info("Epoch [%s/%s] Validation Step [%s/%s] %s", epoch + 1, args.epochs, step + 1, len(loader), meters) + + writer.add_scalar("loss/test", meters.loss.avg, global_step=cur_step) + writer.add_scalar("acc1/test", meters.acc1.avg, global_step=cur_step) + writer.add_scalar("acc5/test", meters.acc5.avg, global_step=cur_step) + logger.info("Epoch %d validation: %s", epoch + 1, meters) if __name__ == "__main__": parser = argparse.ArgumentParser("SPOS Supernet Training") parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet") - parser.add_argument("--architecture", type=str, required=True) + parser.add_argument("--tb-dir", type=str, default="runs") + parser.add_argument("--architecture", type=str, default="architecture_final.json") parser.add_argument("--workers", type=int, default=8) parser.add_argument("--batch-size", type=int, default=1024) parser.add_argument("--epochs", type=int, default=240) - parser.add_argument("--learning-rate", type=float, default=1.) + parser.add_argument("--learning-rate", type=float, default=0.5) parser.add_argument("--momentum", type=float, default=0.9) parser.add_argument("--weight-decay", type=float, default=4E-5) parser.add_argument("--label-smooth", type=float, default=0.1) parser.add_argument("--log-frequency", type=int, default=10) args = parser.parse_args() - dataset_train, dataset_valid = get_imagenet(args.imagenet_dir) + dataset_train, dataset_valid = get_imagenet(args.imagenet_dir, False) train_loader = DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) valid_loader = DataLoader(dataset_valid, batch_size=args.batch_size, shuffle=False, @@ -79,7 +96,10 @@ def validate(epoch, model, criterion, loader, args): lambda step: (1.0 - step / args.epochs) if step <= args.epochs else 0, last_epoch=-1) + writer = SummaryWriter(log_dir=args.tb_dir) + for epoch in range(args.epochs): - train(epoch, model, criterion, optimizer, train_loader, args) - validate(epoch, model, criterion, valid_loader, args) + train(epoch, model, criterion, optimizer, train_loader, writer, args) + validate(epoch, model, criterion, valid_loader, writer, args) scheduler.step() + writer.close() diff --git a/examples/nas/spos/supernet.py b/examples/nas/spos/supernet.py index 38c052fbd7..24711b36a6 100644 --- a/examples/nas/spos/supernet.py +++ b/examples/nas/spos/supernet.py @@ -20,7 +20,7 @@ parser.add_argument("--workers", type=int, default=4) parser.add_argument("--batch-size", type=int, default=1024) parser.add_argument("--epochs", type=int, default=120) - parser.add_argument("--learning-rate", type=float, default=1.) 
+ parser.add_argument("--learning-rate", type=float, default=0.5) parser.add_argument("--momentum", type=float, default=0.9) parser.add_argument("--weight-decay", type=float, default=4E-5) parser.add_argument("--label-smooth", type=float, default=0.1) From fc1eb998676ab4086c605d976e3cda19aa926e82 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Fri, 6 Dec 2019 13:41:24 +0800 Subject: [PATCH 25/57] update --- examples/nas/spos/scratch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py index 2cb7f5c7fb..651209aa9c 100644 --- a/examples/nas/spos/scratch.py +++ b/examples/nas/spos/scratch.py @@ -70,7 +70,7 @@ def validate(epoch, model, criterion, loader, writer, args): parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet") parser.add_argument("--tb-dir", type=str, default="runs") parser.add_argument("--architecture", type=str, default="architecture_final.json") - parser.add_argument("--workers", type=int, default=8) + parser.add_argument("--workers", type=int, default=12) parser.add_argument("--batch-size", type=int, default=1024) parser.add_argument("--epochs", type=int, default=240) parser.add_argument("--learning-rate", type=float, default=0.5) From 34ffa314e3ed3876974d68098015c66e3a6b347e Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Fri, 6 Dec 2019 14:50:22 +0000 Subject: [PATCH 26/57] fix cur_step error --- examples/nas/spos/scratch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py index 651209aa9c..2aae81c197 100644 --- a/examples/nas/spos/scratch.py +++ b/examples/nas/spos/scratch.py @@ -58,9 +58,9 @@ def validate(epoch, model, criterion, loader, writer, args): logger.info("Epoch [%s/%s] Validation Step [%s/%s] %s", epoch + 1, args.epochs, step + 1, len(loader), meters) - writer.add_scalar("loss/test", meters.loss.avg, global_step=cur_step) - writer.add_scalar("acc1/test", meters.acc1.avg, global_step=cur_step) - writer.add_scalar("acc5/test", meters.acc5.avg, global_step=cur_step) + writer.add_scalar("loss/test", meters.loss.avg, global_step=epoch) + writer.add_scalar("acc1/test", meters.acc1.avg, global_step=epoch) + writer.add_scalar("acc5/test", meters.acc5.avg, global_step=epoch) logger.info("Epoch %d validation: %s", epoch + 1, meters) From ef387c9f66af8839c8b188bfa134beb9ecdd773e Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sat, 7 Dec 2019 01:35:13 +0800 Subject: [PATCH 27/57] update --- examples/nas/spos/dataloader.py | 81 +++++++++++++++++++++++++++++++++ examples/nas/spos/scratch.py | 43 +++++++++-------- 2 files changed, 102 insertions(+), 22 deletions(-) create mode 100644 examples/nas/spos/dataloader.py diff --git a/examples/nas/spos/dataloader.py b/examples/nas/spos/dataloader.py new file mode 100644 index 0000000000..76692d6726 --- /dev/null +++ b/examples/nas/spos/dataloader.py @@ -0,0 +1,81 @@ +import time +import torch.utils.data +import nvidia.dali.ops as ops +import nvidia.dali.types as types +import torchvision.datasets as datasets +from nvidia.dali.pipeline import Pipeline +import torchvision.transforms as transforms +from nvidia.dali.plugin.pytorch import DALIClassificationIterator, DALIGenericIterator + + +class HybridTrainPipe(Pipeline): + def __init__(self, batch_size, num_threads, device_id, data_dir, crop, dali_cpu=False, local_rank=0, world_size=1): + super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id) + dali_device = "gpu" + 
self.input = ops.FileReader(file_root=data_dir, shard_id=local_rank, num_shards=world_size, random_shuffle=True) + self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB) + self.res = ops.RandomResizedCrop(device="gpu", size=crop, random_area=[0.08, 1.25]) + self.twist = ops.ColorTwist(device="gpu") + self.jitter_rng = ops.Uniform(range=[0.6, 1.4]) + self.cmnp = ops.CropMirrorNormalize(device="gpu", + output_dtype=types.FLOAT, + output_layout=types.NCHW, + image_type=types.RGB, + mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], + std=[0.229 * 255, 0.224 * 255, 0.225 * 255]) + self.coin = ops.CoinFlip(probability=0.5) + + def define_graph(self): + rng = self.coin() + saturation = self.jitter_rng() + contrast = self.jitter_rng() + brightness = self.jitter_rng() + self.jpegs, self.labels = self.input(name="Reader") + images = self.decode(self.jpegs) + images = self.res(images) + images = self.twist(images, saturation=saturation, contrast=contrast, brightness=brightness) + output = self.cmnp(images, mirror=rng) + return [output, self.labels] + + +class HybridValPipe(Pipeline): + def __init__(self, batch_size, num_threads, device_id, data_dir, crop, size, local_rank=0, world_size=1): + super(HybridValPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id) + self.input = ops.FileReader(file_root=data_dir, shard_id=local_rank, num_shards=world_size, + random_shuffle=False) + self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB) + self.res = ops.Resize(device="gpu", resize_shorter=size, interp_type=types.INTERP_TRIANGULAR) + self.cmnp = ops.CropMirrorNormalize(device="gpu", + output_dtype=types.FLOAT, + output_layout=types.NCHW, + crop=(crop, crop), + image_type=types.RGB, + mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], + std=[0.229 * 255, 0.224 * 255, 0.225 * 255]) + + def define_graph(self): + self.jpegs, self.labels = self.input(name="Reader") + images = self.decode(self.jpegs) + images = self.res(images) + output = self.cmnp(images) + return [output, self.labels] + + +def get_imagenet_iter_dali(split, image_dir, batch_size, num_threads, + crop=224, val_size=256): + world_size = 1 + local_rank = device_id = 0 + if split == 'train': + pipeline = HybridTrainPipe(batch_size=batch_size, num_threads=num_threads, device_id=local_rank, + data_dir=image_dir + '/train', + crop=crop, world_size=world_size, local_rank=local_rank) + elif split == 'val': + pipeline = HybridValPipe(batch_size=batch_size, num_threads=num_threads, device_id=local_rank, + data_dir=image_dir + '/val', + crop=crop, size=val_size, world_size=world_size, local_rank=local_rank) + else: + raise AssertionError + pipeline.build() + num_samples = pipeline.epoch_size("Reader") + return DALIClassificationIterator(pipeline, size=num_samples // world_size), \ + (num_samples + batch_size - 1) // batch_size diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py index 2aae81c197..7a27bd4fb8 100644 --- a/examples/nas/spos/scratch.py +++ b/examples/nas/spos/scratch.py @@ -5,23 +5,23 @@ import torch.nn as nn from nni.nas.pytorch.fixed import apply_fixed_architecture from nni.nas.pytorch.utils import AverageMeterGroup -from torch.utils.data.dataloader import DataLoader from torch.utils.tensorboard import SummaryWriter +from dataloader import get_imagenet_iter_dali from network import ShuffleNetV2OneShot, load_and_parse_state_dict -from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy +from utils import CrossEntropyLabelSmooth, accuracy logger = logging.getLogger("nni") 
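The changes that follow read batches from the DALI iterator built in `dataloader.py` above instead of a torchvision `DataLoader`; a rough consumption sketch (illustrative only, path and sizes are made up):

    # DALIClassificationIterator yields a list with one dict per pipeline; the fields
    # used below are "data" (images, on the GPU the pipeline runs on) and "label".
    loader, num_iters = get_imagenet_iter_dali("val", "./data/imagenet", batch_size=64, num_threads=4)
    for data in loader:
        images = data[0]["data"]
        labels = data[0]["label"].view(-1).long().cuda(non_blocking=True)
        break
    loader.reset()  # DALI iterators are reset between epochs, as the updated main loop below does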
-def train(epoch, model, criterion, optimizer, loader, writer, args): +def train(epoch, model, criterion, optimizer, loader, writer, args, num_iters): model.train() meters = AverageMeterGroup() cur_lr = optimizer.param_groups[0]["lr"] - for step, (x, y) in enumerate(loader): - cur_step = len(loader) * epoch + step - x, y = x.cuda(non_blocking=True), y.cuda(non_blocking=True) + for step, data in enumerate(loader): + cur_step = num_iters * epoch + step + x, y = data[0]["data"], data[0]["label"].view(-1).long().cuda(non_blocking=True) optimizer.zero_grad() logits = model(x) loss = criterion(logits, y) @@ -37,26 +37,28 @@ def train(epoch, model, criterion, optimizer, loader, writer, args): writer.add_scalar("acc1/train", metrics["acc1"], global_step=cur_step) writer.add_scalar("acc5/train", metrics["acc5"], global_step=cur_step) - if step % args.log_frequency == 0 or step + 1 == len(loader): - logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch + 1, - args.epochs, step + 1, len(loader), meters) + if step % args.log_frequency == 0 or step + 1 == num_iters: + logger.info("Epoch [%d/%d] Step [%d/%d] %s", epoch + 1, + args.epochs, step + 1, num_iters, meters) + logger.info("Epoch %d training summary: %s", epoch + 1, meters) -def validate(epoch, model, criterion, loader, writer, args): + +def validate(epoch, model, criterion, loader, writer, args, num_iters): model.eval() meters = AverageMeterGroup() with torch.no_grad(): - for step, (x, y) in enumerate(loader): - x, y = x.cuda(non_blocking=True), y.cuda(non_blocking=True) + for step, data in enumerate(loader): + x, y = data[0]["data"], data[0]["label"].view(-1).long().cuda(non_blocking=True) logits = model(x) loss = criterion(logits, y) metrics = accuracy(logits, y) metrics["loss"] = loss.item() meters.update(metrics) - if step % args.log_frequency == 0 or step + 1 == len(loader): - logger.info("Epoch [%s/%s] Validation Step [%s/%s] %s", epoch + 1, - args.epochs, step + 1, len(loader), meters) + if step % args.log_frequency == 0 or step + 1 == num_iters: + logger.info("Epoch [%d/%d] Validation Step [%d/%d] %s", epoch + 1, + args.epochs, step + 1, num_iters, meters) writer.add_scalar("loss/test", meters.loss.avg, global_step=epoch) writer.add_scalar("acc1/test", meters.acc1.avg, global_step=epoch) @@ -80,11 +82,8 @@ def validate(epoch, model, criterion, loader, writer, args): parser.add_argument("--log-frequency", type=int, default=10) args = parser.parse_args() - dataset_train, dataset_valid = get_imagenet(args.imagenet_dir, False) - train_loader = DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True, - num_workers=args.workers, pin_memory=True) - valid_loader = DataLoader(dataset_valid, batch_size=args.batch_size, shuffle=False, - num_workers=args.workers, pin_memory=True) + train_loader, train_iters = get_imagenet_iter_dali("train", args.imagenet_dir, args.batch_size, args.workers) + valid_loader, valid_iters = get_imagenet_iter_dali("val", args.imagenet_dir, args.batch_size, args.workers) model = ShuffleNetV2OneShot() model.cuda() apply_fixed_architecture(model, args.architecture) @@ -99,7 +98,7 @@ def validate(epoch, model, criterion, loader, writer, args): writer = SummaryWriter(log_dir=args.tb_dir) for epoch in range(args.epochs): - train(epoch, model, criterion, optimizer, train_loader, writer, args) - validate(epoch, model, criterion, valid_loader, writer, args) + train(epoch, model, criterion, optimizer, train_loader, writer, args, train_iters) + validate(epoch, model, criterion, valid_loader, writer, args, valid_iters) 
scheduler.step() writer.close() From 082abcdf6a3665bf4833c9117b24e586f33c62b0 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sat, 7 Dec 2019 09:43:04 +0800 Subject: [PATCH 28/57] update --- examples/nas/spos/scratch.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py index 7a27bd4fb8..515e1ac062 100644 --- a/examples/nas/spos/scratch.py +++ b/examples/nas/spos/scratch.py @@ -14,7 +14,9 @@ logger = logging.getLogger("nni") -def train(epoch, model, criterion, optimizer, loader, writer, args, num_iters): +def train(epoch, model, criterion, optimizer, writer, args): + loader, num_iters = get_imagenet_iter_dali("train", args.imagenet_dir, args.batch_size, args.workers) + model.train() meters = AverageMeterGroup() cur_lr = optimizer.param_groups[0]["lr"] @@ -44,7 +46,9 @@ def train(epoch, model, criterion, optimizer, loader, writer, args, num_iters): logger.info("Epoch %d training summary: %s", epoch + 1, meters) -def validate(epoch, model, criterion, loader, writer, args, num_iters): +def validate(epoch, model, criterion, writer, args): + loader, num_iters = get_imagenet_iter_dali("val", args.imagenet_dir, args.batch_size, args.workers) + model.eval() meters = AverageMeterGroup() with torch.no_grad(): @@ -82,8 +86,7 @@ def validate(epoch, model, criterion, loader, writer, args, num_iters): parser.add_argument("--log-frequency", type=int, default=10) args = parser.parse_args() - train_loader, train_iters = get_imagenet_iter_dali("train", args.imagenet_dir, args.batch_size, args.workers) - valid_loader, valid_iters = get_imagenet_iter_dali("val", args.imagenet_dir, args.batch_size, args.workers) + model = ShuffleNetV2OneShot() model.cuda() apply_fixed_architecture(model, args.architecture) @@ -98,7 +101,7 @@ def validate(epoch, model, criterion, loader, writer, args, num_iters): writer = SummaryWriter(log_dir=args.tb_dir) for epoch in range(args.epochs): - train(epoch, model, criterion, optimizer, train_loader, writer, args, train_iters) - validate(epoch, model, criterion, valid_loader, writer, args, valid_iters) + train(epoch, model, criterion, optimizer, writer, args) + validate(epoch, model, criterion, writer, args) scheduler.step() writer.close() From 28c5b2ddca99d2e572035167e34998f621b58fe4 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sat, 7 Dec 2019 14:13:38 +0800 Subject: [PATCH 29/57] update --- examples/nas/spos/dataloader.py | 24 ++++++++++++------------ examples/nas/spos/scratch.py | 23 +++++++++++++---------- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/examples/nas/spos/dataloader.py b/examples/nas/spos/dataloader.py index 76692d6726..38805996aa 100644 --- a/examples/nas/spos/dataloader.py +++ b/examples/nas/spos/dataloader.py @@ -1,3 +1,4 @@ +import os import time import torch.utils.data import nvidia.dali.ops as ops @@ -9,12 +10,11 @@ class HybridTrainPipe(Pipeline): - def __init__(self, batch_size, num_threads, device_id, data_dir, crop, dali_cpu=False, local_rank=0, world_size=1): + def __init__(self, batch_size, num_threads, device_id, data_dir, crop, local_rank=0, world_size=1): super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id) - dali_device = "gpu" self.input = ops.FileReader(file_root=data_dir, shard_id=local_rank, num_shards=world_size, random_shuffle=True) self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB) - self.res = ops.RandomResizedCrop(device="gpu", size=crop, random_area=[0.08, 1.25]) + 
self.res = ops.RandomResizedCrop(device="gpu", size=crop) self.twist = ops.ColorTwist(device="gpu") self.jitter_rng = ops.Uniform(range=[0.6, 1.4]) self.cmnp = ops.CropMirrorNormalize(device="gpu", @@ -63,19 +63,19 @@ def define_graph(self): def get_imagenet_iter_dali(split, image_dir, batch_size, num_threads, crop=224, val_size=256): - world_size = 1 - local_rank = device_id = 0 - if split == 'train': - pipeline = HybridTrainPipe(batch_size=batch_size, num_threads=num_threads, device_id=local_rank, - data_dir=image_dir + '/train', + world_size, local_rank = 1, 0 + device_id = torch.cuda.device_count() - 1 # use last gpu + if split == "train": + pipeline = HybridTrainPipe(batch_size=batch_size, num_threads=num_threads, device_id=device_id, + data_dir=os.path.join(image_dir, "train"), crop=crop, world_size=world_size, local_rank=local_rank) - elif split == 'val': - pipeline = HybridValPipe(batch_size=batch_size, num_threads=num_threads, device_id=local_rank, - data_dir=image_dir + '/val', + elif split == "val": + pipeline = HybridValPipe(batch_size=batch_size, num_threads=num_threads, device_id=device_id, + data_dir=os.path.join(image_dir, "val"), crop=crop, size=val_size, world_size=world_size, local_rank=local_rank) else: raise AssertionError pipeline.build() num_samples = pipeline.epoch_size("Reader") - return DALIClassificationIterator(pipeline, size=num_samples // world_size), \ + return DALIClassificationIterator(pipeline, size=num_samples, fill_last_batch=split == "train"), \ (num_samples + batch_size - 1) // batch_size diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py index 515e1ac062..a15500b96f 100644 --- a/examples/nas/spos/scratch.py +++ b/examples/nas/spos/scratch.py @@ -14,9 +14,7 @@ logger = logging.getLogger("nni") -def train(epoch, model, criterion, optimizer, writer, args): - loader, num_iters = get_imagenet_iter_dali("train", args.imagenet_dir, args.batch_size, args.workers) - +def train(epoch, model, criterion, optimizer, loader, num_iters, writer, args): model.train() meters = AverageMeterGroup() cur_lr = optimizer.param_groups[0]["lr"] @@ -46,9 +44,7 @@ def train(epoch, model, criterion, optimizer, writer, args): logger.info("Epoch %d training summary: %s", epoch + 1, meters) -def validate(epoch, model, criterion, writer, args): - loader, num_iters = get_imagenet_iter_dali("val", args.imagenet_dir, args.batch_size, args.workers) - +def validate(epoch, model, criterion, loader, num_iters, writer, args): model.eval() meters = AverageMeterGroup() with torch.no_grad(): @@ -68,7 +64,7 @@ def validate(epoch, model, criterion, writer, args): writer.add_scalar("acc1/test", meters.acc1.avg, global_step=epoch) writer.add_scalar("acc5/test", meters.acc5.avg, global_step=epoch) - logger.info("Epoch %d validation: %s", epoch + 1, meters) + logger.info("Epoch %d validation: top1 = %f, top5 = %f", epoch + 1, meters.acc1.avg, meters.acc5.avg) if __name__ == "__main__": @@ -90,7 +86,8 @@ def validate(epoch, model, criterion, writer, args): model = ShuffleNetV2OneShot() model.cuda() apply_fixed_architecture(model, args.architecture) - model = nn.DataParallel(model) + if torch.cuda.device_count() > 1: # exclude last gpu, saving for data preprocessing on gpu + model = nn.DataParallel(model, device_ids=list(range(0, torch.cuda.device_count() - 1))) criterion = CrossEntropyLabelSmooth(1000, 0.1) optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) @@ -100,8 +97,14 @@ def validate(epoch, model, 
criterion, writer, args): last_epoch=-1) writer = SummaryWriter(log_dir=args.tb_dir) + train_loader, train_iters = get_imagenet_iter_dali("train", args.imagenet_dir, args.batch_size, args.workers) + val_loader, val_iters = get_imagenet_iter_dali("val", args.imagenet_dir, args.batch_size, args.workers) + for epoch in range(args.epochs): - train(epoch, model, criterion, optimizer, writer, args) - validate(epoch, model, criterion, writer, args) + train(epoch, model, criterion, optimizer, train_loader, train_iters, writer, args) + validate(epoch, model, criterion, val_loader, val_iters, writer, args) scheduler.step() + train_loader.reset() + val_loader.reset() + writer.close() From ff2d2e71b14d1d989a513285b3270aea3f7283d1 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 9 Dec 2019 10:34:04 +0800 Subject: [PATCH 30/57] update format --- examples/nas/spos/dataloader.py | 10 ++++------ examples/nas/spos/network.py | 3 +-- examples/nas/spos/scratch.py | 6 +++--- examples/nas/spos/supernet.py | 7 +++---- examples/nas/spos/tester.py | 1 + examples/nas/spos/utils.py | 19 ++++++++----------- 6 files changed, 20 insertions(+), 26 deletions(-) diff --git a/examples/nas/spos/dataloader.py b/examples/nas/spos/dataloader.py index 38805996aa..64b42bae8c 100644 --- a/examples/nas/spos/dataloader.py +++ b/examples/nas/spos/dataloader.py @@ -1,12 +1,10 @@ import os -import time -import torch.utils.data + import nvidia.dali.ops as ops import nvidia.dali.types as types -import torchvision.datasets as datasets +import torch.utils.data from nvidia.dali.pipeline import Pipeline -import torchvision.transforms as transforms -from nvidia.dali.plugin.pytorch import DALIClassificationIterator, DALIGenericIterator +from nvidia.dali.plugin.pytorch import DALIClassificationIterator class HybridTrainPipe(Pipeline): @@ -78,4 +76,4 @@ def get_imagenet_iter_dali(split, image_dir, batch_size, num_threads, pipeline.build() num_samples = pipeline.epoch_size("Reader") return DALIClassificationIterator(pipeline, size=num_samples, fill_last_batch=split == "train"), \ - (num_samples + batch_size - 1) // batch_size + (num_samples + batch_size - 1) // batch_size diff --git a/examples/nas/spos/network.py b/examples/nas/spos/network.py index 14ae871cfd..9158513f4a 100644 --- a/examples/nas/spos/network.py +++ b/examples/nas/spos/network.py @@ -4,11 +4,10 @@ import torch import torch.nn as nn +from nni.nas.pytorch import mutables from blocks import ShuffleNetBlock, ShuffleXceptionBlock -from nni.nas.pytorch import mutables - class ShuffleNetV2OneShot(nn.Module): block_keys = [ diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py index a15500b96f..59ad748226 100644 --- a/examples/nas/spos/scratch.py +++ b/examples/nas/spos/scratch.py @@ -3,12 +3,12 @@ import torch import torch.nn as nn +from dataloader import get_imagenet_iter_dali from nni.nas.pytorch.fixed import apply_fixed_architecture from nni.nas.pytorch.utils import AverageMeterGroup from torch.utils.tensorboard import SummaryWriter -from dataloader import get_imagenet_iter_dali -from network import ShuffleNetV2OneShot, load_and_parse_state_dict +from network import ShuffleNetV2OneShot from utils import CrossEntropyLabelSmooth, accuracy logger = logging.getLogger("nni") @@ -82,7 +82,7 @@ def validate(epoch, model, criterion, loader, num_iters, writer, args): parser.add_argument("--log-frequency", type=int, default=10) args = parser.parse_args() - + model = ShuffleNetV2OneShot() model.cuda() apply_fixed_architecture(model, args.architecture) diff --git 
a/examples/nas/spos/supernet.py b/examples/nas/spos/supernet.py index 24711b36a6..6d5160a088 100644 --- a/examples/nas/spos/supernet.py +++ b/examples/nas/spos/supernet.py @@ -2,13 +2,12 @@ import torch import torch.nn as nn -from nni.nas.pytorch.callbacks import Callback, LRSchedulerCallback - -from network import ShuffleNetV2OneShot, load_and_parse_state_dict +from nni.nas.pytorch.callbacks import LRSchedulerCallback from nni.nas.pytorch.callbacks import ModelCheckpoint from nni.nas.pytorch.spos import SPOSSupernetTrainingMutator, SPOSSupernetTrainer -from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy +from network import ShuffleNetV2OneShot, load_and_parse_state_dict +from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy if __name__ == "__main__": parser = argparse.ArgumentParser("SPOS Supernet Training") diff --git a/examples/nas/spos/tester.py b/examples/nas/spos/tester.py index f0b5658d32..5ddb187bba 100644 --- a/examples/nas/spos/tester.py +++ b/examples/nas/spos/tester.py @@ -1,5 +1,6 @@ import argparse import logging +import random from itertools import cycle import nni diff --git a/examples/nas/spos/utils.py b/examples/nas/spos/utils.py index 5a7ab1941c..eaed7128e6 100644 --- a/examples/nas/spos/utils.py +++ b/examples/nas/spos/utils.py @@ -1,15 +1,12 @@ import os +import numpy as np import torch import torch.nn as nn - from PIL import Image -import numpy as np - from torchvision import transforms from torchvision.datasets import ImageNet - IMAGENET_MEAN = [0.485, 0.456, 0.406] IMAGENET_STD = [0.229, 0.224, 0.225] @@ -57,14 +54,14 @@ def get_imagenet(imagenet_root, spos_pre): ] train_transform = transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4), - transforms.RandomHorizontalFlip(0.5), - ] + postprocess) + transforms.RandomResizedCrop(224), + transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4), + transforms.RandomHorizontalFlip(0.5), + ] + postprocess) valid_transform = transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - ] + postprocess) + transforms.Resize(256), + transforms.CenterCrop(224), + ] + postprocess) train_dataset = ImageNet(imagenet_root, split="train", transform=train_transform) valid_dataset = ImageNet(imagenet_root, split="val", transform=valid_transform) return train_dataset, valid_dataset From c034b0aeb1617ef02d9f0db904910f7939da1fdd Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Wed, 11 Dec 2019 13:32:49 +0800 Subject: [PATCH 31/57] update --- examples/nas/spos/config_search.yml | 4 +- examples/nas/spos/dataloader.py | 54 +++++++++++------- examples/nas/spos/evolution.py | 21 +++---- examples/nas/spos/scratch.py | 6 +- examples/nas/spos/tester.py | 87 +++++++++++++---------------- 5 files changed, 86 insertions(+), 86 deletions(-) diff --git a/examples/nas/spos/config_search.yml b/examples/nas/spos/config_search.yml index 1625c8f431..a89a0d397c 100644 --- a/examples/nas/spos/config_search.yml +++ b/examples/nas/spos/config_search.yml @@ -1,6 +1,6 @@ authorName: unknown experimentName: SPOS Search -trialConcurrency: 2 +trialConcurrency: 4 maxExecDuration: 7d maxTrialNum: 99999 trainingServicePlatform: local @@ -12,6 +12,6 @@ tuner: className: SPOSEvolution trial: # to fit in a GTX 1080 - command: python tester.py --imagenet-dir /data/ssd1/v-yugzh/imagenet --spos-prep --workers 6 + command: python tester.py --imagenet-dir /data/ssd1/v-yugzh/imagenet --spos-prep codeDir: . 
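The `--spos-prep` flag passed to `tester.py` in the trial command above selects the original repo's preprocessing: images stay in the 0-255 range, channels are reordered to BGR, and no mean/std normalization is applied. A rough sketch of that transform, equivalent in spirit to the `spos_to_bgr_tensor` helper in `utils.py` (a sketch only, not the DALI pipeline these patches later switch to):

```
# Sketch of SPOS-style preprocessing: BGR channel order, raw 0-255 values, no normalization.
import numpy as np
import torch
from PIL import Image

def spos_preprocess(pil_img):
    arr = np.asarray(pil_img.convert("RGB"), dtype=np.float32)  # HWC, RGB, values in [0, 255]
    arr = arr[:, :, ::-1].copy()                                # RGB -> BGR, as in the original repo
    return torch.from_numpy(arr).permute(2, 0, 1)               # CHW float tensor, left unnormalized
```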
gpuNum: 1 diff --git a/examples/nas/spos/dataloader.py b/examples/nas/spos/dataloader.py index 64b42bae8c..86a9b7a45d 100644 --- a/examples/nas/spos/dataloader.py +++ b/examples/nas/spos/dataloader.py @@ -8,19 +8,21 @@ class HybridTrainPipe(Pipeline): - def __init__(self, batch_size, num_threads, device_id, data_dir, crop, local_rank=0, world_size=1): - super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id) + def __init__(self, batch_size, num_threads, device_id, data_dir, crop, seed=12, local_rank=0, world_size=1, + spos_pre=False): + super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, seed=seed + device_id) + color_space_type = types.BGR if spos_pre else types.RGB self.input = ops.FileReader(file_root=data_dir, shard_id=local_rank, num_shards=world_size, random_shuffle=True) - self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB) + self.decode = ops.ImageDecoder(device="mixed", output_type=color_space_type) self.res = ops.RandomResizedCrop(device="gpu", size=crop) self.twist = ops.ColorTwist(device="gpu") self.jitter_rng = ops.Uniform(range=[0.6, 1.4]) self.cmnp = ops.CropMirrorNormalize(device="gpu", output_dtype=types.FLOAT, output_layout=types.NCHW, - image_type=types.RGB, - mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], - std=[0.229 * 255, 0.224 * 255, 0.225 * 255]) + image_type=color_space_type, + mean=0. if spos_pre else [0.485 * 255, 0.456 * 255, 0.406 * 255], + std=1. if spos_pre else [0.229 * 255, 0.224 * 255, 0.225 * 255]) self.coin = ops.CoinFlip(probability=0.5) def define_graph(self): @@ -37,19 +39,21 @@ def define_graph(self): class HybridValPipe(Pipeline): - def __init__(self, batch_size, num_threads, device_id, data_dir, crop, size, local_rank=0, world_size=1): - super(HybridValPipe, self).__init__(batch_size, num_threads, device_id, seed=12 + device_id) + def __init__(self, batch_size, num_threads, device_id, data_dir, crop, size, seed=12, local_rank=0, world_size=1, + spos_pre=False, shuffle=False): + super(HybridValPipe, self).__init__(batch_size, num_threads, device_id, seed=seed + device_id) + color_space_type = types.BGR if spos_pre else types.RGB self.input = ops.FileReader(file_root=data_dir, shard_id=local_rank, num_shards=world_size, - random_shuffle=False) - self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB) + random_shuffle=shuffle) + self.decode = ops.ImageDecoder(device="mixed", output_type=color_space_type) self.res = ops.Resize(device="gpu", resize_shorter=size, interp_type=types.INTERP_TRIANGULAR) self.cmnp = ops.CropMirrorNormalize(device="gpu", output_dtype=types.FLOAT, output_layout=types.NCHW, crop=(crop, crop), - image_type=types.RGB, - mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], - std=[0.229 * 255, 0.224 * 255, 0.225 * 255]) + image_type=color_space_type, + mean=0. if spos_pre else [0.485 * 255, 0.456 * 255, 0.406 * 255], + std=1. 
if spos_pre else [0.229 * 255, 0.224 * 255, 0.225 * 255]) def define_graph(self): self.jpegs, self.labels = self.input(name="Reader") @@ -59,21 +63,29 @@ def define_graph(self): return [output, self.labels] -def get_imagenet_iter_dali(split, image_dir, batch_size, num_threads, - crop=224, val_size=256): +def get_imagenet_iter_dali(split, image_dir, batch_size, num_threads, crop=224, val_size=256, auto_reset=False, + spos_preprocessing=False, seed=12, shuffle=False, device_id=None): world_size, local_rank = 1, 0 - device_id = torch.cuda.device_count() - 1 # use last gpu + if device_id is None: + device_id = torch.cuda.device_count() - 1 # use last gpu if split == "train": pipeline = HybridTrainPipe(batch_size=batch_size, num_threads=num_threads, device_id=device_id, - data_dir=os.path.join(image_dir, "train"), - crop=crop, world_size=world_size, local_rank=local_rank) + data_dir=os.path.join(image_dir, "train"), seed=seed, + crop=crop, world_size=world_size, local_rank=local_rank, + spos_pre=spos_preprocessing) elif split == "val": pipeline = HybridValPipe(batch_size=batch_size, num_threads=num_threads, device_id=device_id, - data_dir=os.path.join(image_dir, "val"), - crop=crop, size=val_size, world_size=world_size, local_rank=local_rank) + data_dir=os.path.join(image_dir, "val"), seed=seed, + crop=crop, size=val_size, world_size=world_size, local_rank=local_rank, + spos_pre=spos_preprocessing, shuffle=shuffle) else: raise AssertionError pipeline.build() num_samples = pipeline.epoch_size("Reader") - return DALIClassificationIterator(pipeline, size=num_samples, fill_last_batch=split == "train"), \ + return DALIClassificationIterator(pipeline, size=num_samples, fill_last_batch=split == "train", + auto_reset=auto_reset), \ (num_samples + batch_size - 1) // batch_size + + +def convert_data_format_dali(data): + return data[0]["data"], data[0]["label"].view(-1).long().cuda(non_blocking=True) diff --git a/examples/nas/spos/evolution.py b/examples/nas/spos/evolution.py index 2dfd804e67..6d1df54022 100644 --- a/examples/nas/spos/evolution.py +++ b/examples/nas/spos/evolution.py @@ -45,11 +45,11 @@ def _next_round(self): return _logger.info("Epoch %d, generating...", self.epoch) if self.epoch == 0: - self.candidates = self._get_random_population() + self._get_random_population() else: best_candidates = self._select_top_candidates() - self.candidates = self._get_mutation(best_candidates) + self._get_crossover(best_candidates) + \ - self._get_random_population() + self.candidates = self._get_mutation(best_candidates) + self._get_crossover(best_candidates) + self._get_random_population() self.epoch += 1 def _random_candidate(self): @@ -69,15 +69,12 @@ def _add_to_evaluate_queue(self, cand): self._to_evaluate_queue.append(cand) def _get_random_population(self): - result = [] - for _ in range(self.num_population): - while True: - cand = self._random_candidate() - if self._is_legal(cand): - result.append(cand) - self._add_to_evaluate_queue(cand) - break - return result + while len(self.candidates) < self.num_population: + cand = self._random_candidate() + if self._is_legal(cand): + _logger.info("Random candidate generated.") + self._add_to_evaluate_queue(cand) + self.candidates.append(cand) def _get_crossover(self, best): result = [] diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py index 59ad748226..8891c233cb 100644 --- a/examples/nas/spos/scratch.py +++ b/examples/nas/spos/scratch.py @@ -3,7 +3,7 @@ import torch import torch.nn as nn -from dataloader import get_imagenet_iter_dali 
+from dataloader import get_imagenet_iter_dali, convert_data_format_dali from nni.nas.pytorch.fixed import apply_fixed_architecture from nni.nas.pytorch.utils import AverageMeterGroup from torch.utils.tensorboard import SummaryWriter @@ -21,7 +21,7 @@ def train(epoch, model, criterion, optimizer, loader, num_iters, writer, args): for step, data in enumerate(loader): cur_step = num_iters * epoch + step - x, y = data[0]["data"], data[0]["label"].view(-1).long().cuda(non_blocking=True) + x, y = convert_data_format_dali(data) optimizer.zero_grad() logits = model(x) loss = criterion(logits, y) @@ -49,7 +49,7 @@ def validate(epoch, model, criterion, loader, num_iters, writer, args): meters = AverageMeterGroup() with torch.no_grad(): for step, data in enumerate(loader): - x, y = data[0]["data"], data[0]["label"].view(-1).long().cuda(non_blocking=True) + x, y = convert_data_format_dali(data) logits = model(x) loss = criterion(logits, y) metrics = accuracy(logits, y) diff --git a/examples/nas/spos/tester.py b/examples/nas/spos/tester.py index 5ddb187bba..aafdd1f0f3 100644 --- a/examples/nas/spos/tester.py +++ b/examples/nas/spos/tester.py @@ -9,28 +9,27 @@ import torch.nn as nn from nni.nas.pytorch.classic_nas import get_and_apply_next_architecture from nni.nas.pytorch.utils import AverageMeterGroup -from torch.utils.data import DataLoader, SubsetRandomSampler +from dataloader import get_imagenet_iter_dali, convert_data_format_dali from network import ShuffleNetV2OneShot, load_and_parse_state_dict -from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy +from utils import CrossEntropyLabelSmooth, accuracy logger = logging.getLogger("nni") -def retrain_bn(model, criterion, max_iters, log_freq, loader_train, device): +def retrain_bn(model, criterion, max_iters, log_freq, loader): with torch.no_grad(): - logger.info("Clear BN statistics...") - for m in model.modules(): - if isinstance(m, nn.BatchNorm2d): - m.running_mean = torch.zeros_like(m.running_mean) - m.running_var = torch.ones_like(m.running_var) + # logger.info("Clear BN statistics...") + # for m in model.modules(): + # if isinstance(m, nn.BatchNorm2d): + # m.running_mean = torch.zeros_like(m.running_mean) + # m.running_var = torch.ones_like(m.running_var) logger.info("Train BN with training set (BN sanitize)...") model.train() meters = AverageMeterGroup() for step in range(max_iters): - inputs, targets = next(loader_train) - inputs, targets = inputs.to(device), targets.to(device) + inputs, targets = convert_data_format_dali(next(loader)) logits = model(inputs) loss = criterion(logits, targets) metrics = accuracy(logits, targets) @@ -40,39 +39,34 @@ def retrain_bn(model, criterion, max_iters, log_freq, loader_train, device): logger.info("Train Step [%d/%d] %s", step + 1, max_iters, meters) -def test_acc(model, criterion, max_iters, log_freq, loader_test, device): +def test_acc(model, criterion, max_iters, log_freq, loader): logger.info("Start testing...") model.eval() meters = AverageMeterGroup() with torch.no_grad(): - for step in range(max_iters): - inputs, targets = next(loader_test) - inputs, targets = inputs.to(device), targets.to(device) + for step, data in enumerate(loader): + inputs, targets = convert_data_format_dali(data) logits = model(inputs) loss = criterion(logits, targets) metrics = accuracy(logits, targets) metrics["loss"] = loss.item() meters.update(metrics) - if step % log_freq == 0 or step + 1 == max_iters: + if step % log_freq == 0: logger.info("Valid Step [%d/%d] %s", step + 1, max_iters, meters) return 
meters.acc1.avg -def evaluate_acc(model, criterion, args, loader_train, loader_test, device): - acc_before = test_acc(model, criterion, args.test_iters, args.log_frequency, loader_test, device) +def evaluate_acc(model, criterion, args, loader_train, loader_test): + acc_before = test_acc(model, criterion, args.test_iters, args.log_frequency, loader_test) nni.report_intermediate_result(acc_before) - retrain_bn(model, criterion, args.train_iters, args.log_frequency, loader_train, device) - acc = test_acc(model, criterion, args.test_iters, args.log_frequency, loader_test, device) + retrain_bn(model, criterion, args.train_iters, args.log_frequency, loader_train) + acc = test_acc(model, criterion, args.test_iters, args.log_frequency, loader_test) assert isinstance(acc, float) + nni.report_intermediate_result(acc) nni.report_final_result(acc) -def generate_subset_indices(dataset, batch_size, iters): - dataset_length = len(dataset) - return np.random.choice(dataset_length, min(batch_size * iters * 2, dataset_length), replace=False) - - if __name__ == "__main__": parser = argparse.ArgumentParser("SPOS Candidate Tester") parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet") @@ -80,40 +74,37 @@ def generate_subset_indices(dataset, batch_size, iters): parser.add_argument("--spos-preprocessing", action="store_true", default=False, help="When true, image values will range from 0 to 255 and use BGR " "(as in original repo).") - parser.add_argument("--deterministic", action="store_true", default=False) + parser.add_argument("--seed", type=int, default=42) parser.add_argument("--workers", type=int, default=4) parser.add_argument("--train-batch-size", type=int, default=128) parser.add_argument("--train-iters", type=int, default=200) - parser.add_argument("--test-batch-size", type=int, default=1024) - parser.add_argument("--test-iters", type=int, default=10) + parser.add_argument("--test-batch-size", type=int, default=512) + parser.add_argument("--test-iters", type=int, default=40) parser.add_argument("--log-frequency", type=int, default=10) args = parser.parse_args() - if args.deterministic: - # use a fixed set of image will improve the performance - torch.manual_seed(0) - torch.cuda.manual_seed_all(0) - np.random.seed(0) - random.seed(0) - torch.backends.cudnn.deterministic = True + # use a fixed set of image will improve the performance + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + np.random.seed(args.seed) + random.seed(args.seed) + torch.backends.cudnn.deterministic = True - use_gpu = torch.cuda.is_available() - device = torch.device("cuda") if use_gpu else torch.device("cpu") + assert torch.cuda.is_available() model = ShuffleNetV2OneShot() criterion = CrossEntropyLabelSmooth(1000, 0.1) get_and_apply_next_architecture(model) model.load_state_dict(load_and_parse_state_dict(filepath=args.checkpoint)) - model.to(device) - - dataset_train, dataset_valid = get_imagenet(args.imagenet_dir, spos_pre=args.spos_preprocessing) - sampler_train = SubsetRandomSampler(generate_subset_indices(dataset_train, args.train_batch_size, args.train_iters)) - sampler_valid = SubsetRandomSampler(generate_subset_indices(dataset_valid, args.test_batch_size, args.test_iters)) - loader_train = DataLoader(dataset_train, batch_size=args.train_batch_size, - sampler=sampler_train, num_workers=args.workers) - loader_valid = DataLoader(dataset_valid, batch_size=args.test_batch_size, - sampler=sampler_valid, num_workers=args.workers) - loader_train, loader_valid = cycle(loader_train), 
cycle(loader_valid) - - evaluate_acc(model, criterion, args, loader_train, loader_valid, device) + model.cuda() + + train_loader, train_iters = get_imagenet_iter_dali("train", args.imagenet_dir, args.train_batch_size, args.workers, + auto_reset=True, spos_preprocessing=args.spos_preprocessing, + seed=args.seed, device_id=0) + val_loader, val_iters = get_imagenet_iter_dali("val", args.imagenet_dir, args.test_batch_size, args.workers, + spos_preprocessing=args.spos_preprocessing, shuffle=True, + seed=args.seed, device_id=0, auto_reset=True) + train_loader = cycle(train_loader) + + evaluate_acc(model, criterion, args, train_loader, val_loader) From 8f773214983bf5ad0686f6878d465957c463e54b Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Thu, 12 Dec 2019 17:11:15 +0800 Subject: [PATCH 32/57] update --- examples/nas/spos/tester.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/nas/spos/tester.py b/examples/nas/spos/tester.py index aafdd1f0f3..c1e7fb02b9 100644 --- a/examples/nas/spos/tester.py +++ b/examples/nas/spos/tester.py @@ -1,6 +1,7 @@ import argparse import logging import random +import time from itertools import cycle import nni @@ -39,10 +40,11 @@ def retrain_bn(model, criterion, max_iters, log_freq, loader): logger.info("Train Step [%d/%d] %s", step + 1, max_iters, meters) -def test_acc(model, criterion, max_iters, log_freq, loader): +def test_acc(model, criterion, log_freq, loader): logger.info("Start testing...") model.eval() meters = AverageMeterGroup() + start_time = time.time() with torch.no_grad(): for step, data in enumerate(loader): inputs, targets = convert_data_format_dali(data) @@ -52,16 +54,18 @@ def test_acc(model, criterion, max_iters, log_freq, loader): metrics["loss"] = loss.item() meters.update(metrics) if step % log_freq == 0: - logger.info("Valid Step [%d/%d] %s", step + 1, max_iters, meters) + logger.info("Valid Step [%d] time %.3fs acc1 %.4f acc5 %.4f loss %.4f", + step + 1, time.time() - start_time, + meters.acc1.avg, meters.acc5.avg, meters.loss.avg) return meters.acc1.avg def evaluate_acc(model, criterion, args, loader_train, loader_test): - acc_before = test_acc(model, criterion, args.test_iters, args.log_frequency, loader_test) + acc_before = test_acc(model, criterion, args.log_frequency, loader_test) nni.report_intermediate_result(acc_before) retrain_bn(model, criterion, args.train_iters, args.log_frequency, loader_train) - acc = test_acc(model, criterion, args.test_iters, args.log_frequency, loader_test) + acc = test_acc(model, criterion, args.log_frequency, loader_test) assert isinstance(acc, float) nni.report_intermediate_result(acc) nni.report_final_result(acc) @@ -75,11 +79,10 @@ def evaluate_acc(model, criterion, args, loader_train, loader_test): help="When true, image values will range from 0 to 255 and use BGR " "(as in original repo).") parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--workers", type=int, default=4) + parser.add_argument("--workers", type=int, default=6) parser.add_argument("--train-batch-size", type=int, default=128) parser.add_argument("--train-iters", type=int, default=200) parser.add_argument("--test-batch-size", type=int, default=512) - parser.add_argument("--test-iters", type=int, default=40) parser.add_argument("--log-frequency", type=int, default=10) args = parser.parse_args() From 1f2996040f50172edaad0a2e8336ec647a80f6eb Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Fri, 13 Dec 2019 13:08:51 +0800 Subject: [PATCH 33/57] update --- 
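To keep the search workflow in view while reading the tester.py changes in the last two patches: every NNI trial spawned by the evolution experiment pulls one candidate architecture from the tuner, re-estimates BN statistics for that path ("BN sanitize"), and reports accuracy back as the candidate's reward. A condensed sketch of that lifecycle using the function names from the hunks above (not a drop-in script):

```
import nni
from nni.nas.pytorch.classic_nas import get_and_apply_next_architecture

def run_trial(model, criterion, args, loader_train, loader_test):
    # Fix the supernet to the candidate proposed by the evolution tuner for this trial.
    get_and_apply_next_architecture(model)
    # Accuracy with the supernet's stale BN statistics, reported as an intermediate result.
    nni.report_intermediate_result(test_acc(model, criterion, args.log_frequency, loader_test))
    # BN sanitize: recompute running mean/var on training batches for the chosen path.
    retrain_bn(model, criterion, args.train_iters, args.log_frequency, loader_train)
    # The recalibrated accuracy is the final reward the tuner uses to rank this candidate.
    nni.report_final_result(test_acc(model, criterion, args.log_frequency, loader_test))
```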
examples/nas/spos/config_search.yml | 2 +- examples/nas/spos/dataloader.py | 12 ++++---- examples/nas/spos/evolution.py | 1 + examples/nas/spos/readme.md | 37 ++++++++++++++++++++++- src/sdk/pynni/nni/nas/pytorch/decision.py | 5 +++ 5 files changed, 49 insertions(+), 8 deletions(-) diff --git a/examples/nas/spos/config_search.yml b/examples/nas/spos/config_search.yml index a89a0d397c..ca57a25930 100644 --- a/examples/nas/spos/config_search.yml +++ b/examples/nas/spos/config_search.yml @@ -11,7 +11,7 @@ tuner: classFileName: evolution.py className: SPOSEvolution trial: - # to fit in a GTX 1080 + # TODO: change the imagenet dir before release. command: python tester.py --imagenet-dir /data/ssd1/v-yugzh/imagenet --spos-prep codeDir: . gpuNum: 1 diff --git a/examples/nas/spos/dataloader.py b/examples/nas/spos/dataloader.py index 86a9b7a45d..d2a6795ed8 100644 --- a/examples/nas/spos/dataloader.py +++ b/examples/nas/spos/dataloader.py @@ -14,7 +14,8 @@ def __init__(self, batch_size, num_threads, device_id, data_dir, crop, seed=12, color_space_type = types.BGR if spos_pre else types.RGB self.input = ops.FileReader(file_root=data_dir, shard_id=local_rank, num_shards=world_size, random_shuffle=True) self.decode = ops.ImageDecoder(device="mixed", output_type=color_space_type) - self.res = ops.RandomResizedCrop(device="gpu", size=crop) + self.res = ops.RandomResizedCrop(device="gpu", size=crop, + interp_type=types.INTERP_LINEAR if spos_pre else types.INTERP_TRIANGULAR) self.twist = ops.ColorTwist(device="gpu") self.jitter_rng = ops.Uniform(range=[0.6, 1.4]) self.cmnp = ops.CropMirrorNormalize(device="gpu", @@ -27,13 +28,11 @@ def __init__(self, batch_size, num_threads, device_id, data_dir, crop, seed=12, def define_graph(self): rng = self.coin() - saturation = self.jitter_rng() - contrast = self.jitter_rng() - brightness = self.jitter_rng() self.jpegs, self.labels = self.input(name="Reader") images = self.decode(self.jpegs) images = self.res(images) - images = self.twist(images, saturation=saturation, contrast=contrast, brightness=brightness) + images = self.twist(images, saturation=self.jitter_rng(), + contrast=self.jitter_rng(), brightness=self.jitter_rng()) output = self.cmnp(images, mirror=rng) return [output, self.labels] @@ -46,7 +45,8 @@ def __init__(self, batch_size, num_threads, device_id, data_dir, crop, size, see self.input = ops.FileReader(file_root=data_dir, shard_id=local_rank, num_shards=world_size, random_shuffle=shuffle) self.decode = ops.ImageDecoder(device="mixed", output_type=color_space_type) - self.res = ops.Resize(device="gpu", resize_shorter=size, interp_type=types.INTERP_TRIANGULAR) + self.res = ops.Resize(device="gpu", resize_shorter=size, + interp_type=types.INTERP_LINEAR if spos_pre else types.INTERP_TRIANGULAR) self.cmnp = ops.CropMirrorNormalize(device="gpu", output_dtype=types.FLOAT, output_layout=types.NCHW, diff --git a/examples/nas/spos/evolution.py b/examples/nas/spos/evolution.py index 6d1df54022..83a8aa68e9 100644 --- a/examples/nas/spos/evolution.py +++ b/examples/nas/spos/evolution.py @@ -119,6 +119,7 @@ def _is_legal(self, cand): def _select_top_candidates(self): reward_query = lambda cand: self._reward_dict[self._hashcode(cand)] + _logger.info("All candidate rewards: %s", list(map(reward_query, result))) result = sorted(self.candidates, key=reward_query, reverse=True)[:self.num_select] _logger.info("Best candidate rewards: %s", list(map(reward_query, result))) return result diff --git a/examples/nas/spos/readme.md b/examples/nas/spos/readme.md index 
b2f2697098..138a72f1a3 100644 --- a/examples/nas/spos/readme.md +++ b/examples/nas/spos/readme.md @@ -2,10 +2,45 @@ Single Path One-Shot by Megvii Research. +TODO: Reproduction results. + ## Preparation Need to download the flops lookup table from [here](https://1drv.ms/u/s!Am_mmG2-KsrnajesvSdfsq_cN48?e=aHVppN). -Put `op_flops_dict.pkl` under `data` directory. +Put `op_flops_dict.pkl` and `checkpoint-150000.pth.tar` (if you don't want to retrain the supernet) under `data` directory. + +Prepare ImageNet in the standard format (follow the script [here](https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4)). Link it to `data/imagenet` will be more convenient. ## Step 1. Train Supernet +``` +python supernet.py +``` + +Will export the checkpoint to checkpoints directory, for the next step. + +## Step 2. Evolution Search + +To have a search space ready for NNI framework, first run + +``` +nnictl ss_gen -t "python tester.py" +``` + +This will generate a file called `nni_auto_gen_search_space.json`, which is a serialized representation of your search space. + +Then search with evolution tuner. + +``` +nnictl create --config config_search.yml +``` + +TODO: export final architecture from tuner is not ready yet. + +## Step 3. Train from Scratch + +``` +python scratch.py +``` + +It will automatically use `architecture_final.json`, which is already included in this repo. You can use any architecture you want with `--fixed-arc` option. diff --git a/src/sdk/pynni/nni/nas/pytorch/decision.py b/src/sdk/pynni/nni/nas/pytorch/decision.py index e4fb59e67d..48ef728945 100644 --- a/src/sdk/pynni/nni/nas/pytorch/decision.py +++ b/src/sdk/pynni/nni/nas/pytorch/decision.py @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +# Module under development. Not ready for use. + import logging import numpy as np From e63c3f3e86de874ce9ffd9d646c245fabf5d190e Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Fri, 13 Dec 2019 13:09:28 +0800 Subject: [PATCH 34/57] update --- examples/nas/spos/readme.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/nas/spos/readme.md b/examples/nas/spos/readme.md index 138a72f1a3..3fe861f841 100644 --- a/examples/nas/spos/readme.md +++ b/examples/nas/spos/readme.md @@ -11,6 +11,8 @@ Put `op_flops_dict.pkl` and `checkpoint-150000.pth.tar` (if you don't want to re Prepare ImageNet in the standard format (follow the script [here](https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4)). Link it to `data/imagenet` will be more convenient. +We don't support SPOS on CPU. You need to have at least one GPU to run the experiment. + ## Step 1. 
Train Supernet ``` From 048d604db52900f66637f31e38a0829957f317f5 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Fri, 13 Dec 2019 13:37:50 +0800 Subject: [PATCH 35/57] update --- examples/nas/spos/evolution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nas/spos/evolution.py b/examples/nas/spos/evolution.py index 83a8aa68e9..4601b1d8fb 100644 --- a/examples/nas/spos/evolution.py +++ b/examples/nas/spos/evolution.py @@ -119,7 +119,7 @@ def _is_legal(self, cand): def _select_top_candidates(self): reward_query = lambda cand: self._reward_dict[self._hashcode(cand)] - _logger.info("All candidate rewards: %s", list(map(reward_query, result))) + _logger.info("All candidate rewards: %s", list(map(reward_query, self.candidates))) result = sorted(self.candidates, key=reward_query, reverse=True)[:self.num_select] _logger.info("Best candidate rewards: %s", list(map(reward_query, result))) return result From 45d0d7a223113ec31f8653958f766746f093d93a Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sat, 14 Dec 2019 01:13:08 +0800 Subject: [PATCH 36/57] update --- examples/nas/spos/evolution.py | 15 +++++++++++---- examples/nas/spos/tester.py | 10 +++++----- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/examples/nas/spos/evolution.py b/examples/nas/spos/evolution.py index 4601b1d8fb..28ae72d895 100644 --- a/examples/nas/spos/evolution.py +++ b/examples/nas/spos/evolution.py @@ -1,5 +1,6 @@ import json import logging +import re from collections import deque import numpy as np @@ -26,7 +27,7 @@ def __init__(self, max_epochs=20, num_select=10, num_population=50, m_prob=0.1, self.epoch = 0 self.candidates = [] self.search_space = None - self.random_state = np.random.RandomState() + self.random_state = np.random.RandomState(0) # async status self._to_evaluate_queue = deque() @@ -64,7 +65,8 @@ def _random_candidate(self): return chosen_arch def _add_to_evaluate_queue(self, cand): - _logger.info("Generate candidate with flops %d, adding to eval queue.", self.model.get_candidate_flops(cand)) + _logger.info("Generate candidate %s with flops %d, adding to eval queue.", + self._get_architecture_repr(cand), self.model.get_candidate_flops(cand)) self._reward_dict[self._hashcode(cand)] = 0. 
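A short aside on the bookkeeping visible here: candidates are plain dicts, so the tuner keys `_reward_dict` by a canonical JSON dump of the candidate (`_hashcode` is `json.dumps(cand, sort_keys=True)`), and the same hash is how `_is_legal` rejects architectures that were already queued. A minimal illustration; the candidate below is made up and only mimics the `{"_idx": ..., "_value": ...}` shape used by the tuner:

```
import json

def hashcode(cand):
    # Sorted keys make the dump canonical, so equal candidates always hash to the same string.
    return json.dumps(cand, sort_keys=True)

reward_dict = {}
cand = {"LayerChoice1": {"_idx": 2, "_value": "shufflenet_7x7"}}  # hypothetical candidate
if hashcode(cand) not in reward_dict:   # mirrors the duplicate check in _is_legal
    reward_dict[hashcode(cand)] = 0.    # placeholder reward until the trial reports back
```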
self._to_evaluate_queue.append(cand) @@ -95,7 +97,7 @@ def _get_crossover(self, best): def _get_mutation(self, best): result = [] for _ in range(10 * self.num_mutation): - cand = best[self.random_state.randint(len(best))] + cand = best[self.random_state.randint(len(best))].copy() mutation_sample = np.random.random_sample(len(cand)) for s, k in zip(mutation_sample, cand): if s < self.m_prob: @@ -110,6 +112,10 @@ def _get_mutation(self, best): _logger.info("Found %d architectures with mutation.", len(result)) return result + def _get_architecture_repr(self, cand): + return re.sub(r"\".*?\": \{\"_idx\": (\d+), \"_value\": \".*?\"\}", r"\1", + self._hashcode(cand)) + def _is_legal(self, cand): if self._hashcode(cand) in self._reward_dict: return False @@ -134,11 +140,12 @@ def _bind_and_send_parameters(self, use_st_callback=False): parameter_id = self._sending_parameter_queue.popleft() parameters = self._to_evaluate_queue.popleft() self._id2candidate[parameter_id] = parameters + _logger.info("Send parameter [%d] %s.", parameter_id, self._get_architecture_repr(parameters)) result.append(parameters) self._pending_result_ids.add(parameter_id) if use_st_callback: self._st_callback(parameter_id, parameters) - _logger.info("Sending extra parameter with callback.") + _logger.info("Extra parameter with callback.") return result def generate_multiple_parameters(self, parameter_id_list, **kwargs): diff --git a/examples/nas/spos/tester.py b/examples/nas/spos/tester.py index c1e7fb02b9..5c9e1f7b79 100644 --- a/examples/nas/spos/tester.py +++ b/examples/nas/spos/tester.py @@ -20,11 +20,11 @@ def retrain_bn(model, criterion, max_iters, log_freq, loader): with torch.no_grad(): - # logger.info("Clear BN statistics...") - # for m in model.modules(): - # if isinstance(m, nn.BatchNorm2d): - # m.running_mean = torch.zeros_like(m.running_mean) - # m.running_var = torch.ones_like(m.running_var) + logger.info("Clear BN statistics...") + for m in model.modules(): + if isinstance(m, nn.BatchNorm2d): + m.running_mean = torch.zeros_like(m.running_mean) + m.running_var = torch.ones_like(m.running_var) logger.info("Train BN with training set (BN sanitize)...") model.train() From d498a385dcea4b0eee887622c75655ad9be23241 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Sat, 14 Dec 2019 10:12:38 +0800 Subject: [PATCH 37/57] update --- examples/nas/spos/evolution.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/examples/nas/spos/evolution.py b/examples/nas/spos/evolution.py index 28ae72d895..efc056c334 100644 --- a/examples/nas/spos/evolution.py +++ b/examples/nas/spos/evolution.py @@ -134,28 +134,25 @@ def _select_top_candidates(self): def _hashcode(d): return json.dumps(d, sort_keys=True) - def _bind_and_send_parameters(self, use_st_callback=False): + def _bind_and_send_parameters(self): result = [] while self._sending_parameter_queue and self._to_evaluate_queue: parameter_id = self._sending_parameter_queue.popleft() parameters = self._to_evaluate_queue.popleft() self._id2candidate[parameter_id] = parameters - _logger.info("Send parameter [%d] %s.", parameter_id, self._get_architecture_repr(parameters)) result.append(parameters) self._pending_result_ids.add(parameter_id) - if use_st_callback: - self._st_callback(parameter_id, parameters) - _logger.info("Extra parameter with callback.") + self._st_callback(parameter_id, parameters) + _logger.info("Send parameter [%d] %s.", parameter_id, self._get_architecture_repr(parameters)) return result def generate_multiple_parameters(self, 
parameter_id_list, **kwargs): - if "st_callback" in kwargs: + if "st_callback" in kwargs and self._st_callback is None: self._st_callback = kwargs["st_callback"] for parameter_id in parameter_id_list: self._sending_parameter_queue.append(parameter_id) - result = self._bind_and_send_parameters() - _logger.info("Requested %d parameters, %d sent.", len(parameter_id_list), len(result)) - return result + self._bind_and_send_parameters() + return [] # always not use this. might induce problem of over-sending def receive_trial_result(self, parameter_id, parameters, value, **kwargs): _logger.info("Candidate %d, reported reward %f", parameter_id, value) @@ -166,7 +163,5 @@ def trial_end(self, parameter_id, success, **kwargs): if not self._pending_result_ids and not self._to_evaluate_queue: # a new epoch now self._next_round() - if self._st_callback is not None: - self._bind_and_send_parameters(use_st_callback=True) - else: - _logger.warning("No send callback found.") + assert self._st_callback is not None + self._bind_and_send_parameters() From dbe86803b2d397e7dc36a46e9d99ef06b1008ef4 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 17 Dec 2019 16:31:57 +0800 Subject: [PATCH 38/57] update --- examples/nas/spos/config_search.yml | 4 +- examples/nas/spos/dataloader.py | 28 +++++--- examples/nas/spos/readme.md | 14 ++-- examples/nas/spos/scratch.py | 55 +++++++++------- examples/nas/spos/supernet.py | 23 +++++-- examples/nas/spos/tester.py | 25 ++++--- examples/nas/spos/tuner.py | 17 +++++ examples/nas/spos/utils.py | 65 ------------------- .../pynni/nni/nas/pytorch/spos/__init__.py | 1 + .../pynni/nni/nas/pytorch}/spos/evolution.py | 29 +++++---- src/sdk/pynni/nni/nas/pytorch/spos/trainer.py | 18 ++--- 11 files changed, 135 insertions(+), 144 deletions(-) create mode 100644 examples/nas/spos/tuner.py rename {examples/nas => src/sdk/pynni/nni/nas/pytorch}/spos/evolution.py (87%) diff --git a/examples/nas/spos/config_search.yml b/examples/nas/spos/config_search.yml index ca57a25930..2ba5373041 100644 --- a/examples/nas/spos/config_search.yml +++ b/examples/nas/spos/config_search.yml @@ -8,8 +8,8 @@ searchSpacePath: nni_auto_gen_search_space.json useAnnotation: false tuner: codeDir: . - classFileName: evolution.py - className: SPOSEvolution + classFileName: tuner.py + className: EvolutionWithFlops trial: # TODO: change the imagenet dir before release. 
command: python tester.py --imagenet-dir /data/ssd1/v-yugzh/imagenet --spos-prep diff --git a/examples/nas/spos/dataloader.py b/examples/nas/spos/dataloader.py index d2a6795ed8..75445fdf94 100644 --- a/examples/nas/spos/dataloader.py +++ b/examples/nas/spos/dataloader.py @@ -63,7 +63,23 @@ def define_graph(self): return [output, self.labels] -def get_imagenet_iter_dali(split, image_dir, batch_size, num_threads, crop=224, val_size=256, auto_reset=False, +class ClassificationWrapper: + def __init__(self, loader, size): + self.loader = loader + self.size = size + + def __iter__(self): + return self + + def __next__(self): + data = next(self.loader) + return data[0]["data"], data[0]["label"].view(-1).long().cuda(non_blocking=True) + + def __len__(self): + return self.size + + +def get_imagenet_iter_dali(split, image_dir, batch_size, num_threads, crop=224, val_size=256, spos_preprocessing=False, seed=12, shuffle=False, device_id=None): world_size, local_rank = 1, 0 if device_id is None: @@ -82,10 +98,6 @@ def get_imagenet_iter_dali(split, image_dir, batch_size, num_threads, crop=224, raise AssertionError pipeline.build() num_samples = pipeline.epoch_size("Reader") - return DALIClassificationIterator(pipeline, size=num_samples, fill_last_batch=split == "train", - auto_reset=auto_reset), \ - (num_samples + batch_size - 1) // batch_size - - -def convert_data_format_dali(data): - return data[0]["data"], data[0]["label"].view(-1).long().cuda(non_blocking=True) + print(num_samples) + return ClassificationWrapper(DALIClassificationIterator(pipeline, size=num_samples, fill_last_batch=split == "train", + auto_reset=True), (num_samples + batch_size - 1) // batch_size) diff --git a/examples/nas/spos/readme.md b/examples/nas/spos/readme.md index 3fe861f841..31ee641bb7 100644 --- a/examples/nas/spos/readme.md +++ b/examples/nas/spos/readme.md @@ -1,6 +1,8 @@ -# Single Path One-Shot +# Single Path One-Shot Neural Architecture Search with Uniform Sampling -Single Path One-Shot by Megvii Research. +Single Path One-Shot by Megvii Research. [Paper link](https://arxiv.org/abs/1904.00420) [Official repo](https://github.com/megvii-model/SinglePathOneShot) + +Block search only. Channel search is not supported yet. TODO: Reproduction results. @@ -11,7 +13,7 @@ Put `op_flops_dict.pkl` and `checkpoint-150000.pth.tar` (if you don't want to re Prepare ImageNet in the standard format (follow the script [here](https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4)). Link it to `data/imagenet` will be more convenient. -We don't support SPOS on CPU. You need to have at least one GPU to run the experiment. +We don't support SPOS on CPU. You need to have at least one GPU to run the experiment. This is mainly because NVIDIA DALI is used as a prerequisite to accelerate the data loading of ImageNet. ## Step 1. Train Supernet @@ -21,6 +23,8 @@ python supernet.py Will export the checkpoint to checkpoints directory, for the next step. +NOTE: The data loading used in the official repo is [slightly different from usual](https://github.com/megvii-model/SinglePathOneShot/issues/5). The option `--spos-preprocessing` will simulate the behavior used originally and enable you to use the checkpoints pretrained. + ## Step 2. Evolution Search To have a search space ready for NNI framework, first run @@ -37,7 +41,7 @@ Then search with evolution tuner. nnictl create --config config_search.yml ``` -TODO: export final architecture from tuner is not ready yet. 
+The final architecture exported from every epoch of evolution can be found in `checkpoints` under the working directory of your tuner, which, by default, is `$HOME/nni/experiments/$EXP_ID/log`. ## Step 3. Train from Scratch @@ -45,4 +49,4 @@ TODO: export final architecture from tuner is not ready yet. python scratch.py ``` -It will automatically use `architecture_final.json`, which is already included in this repo. You can use any architecture you want with `--fixed-arc` option. +It will automatically use `architecture_final.json`, which is already tracked here. You can use any architecture (e.g., the architecture found in step 2) with `--fixed-arc` option. diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py index 8891c233cb..4e6919ddf2 100644 --- a/examples/nas/spos/scratch.py +++ b/examples/nas/spos/scratch.py @@ -1,9 +1,11 @@ import argparse import logging +import random +import numpy as np import torch import torch.nn as nn -from dataloader import get_imagenet_iter_dali, convert_data_format_dali +from dataloader import get_imagenet_iter_dali from nni.nas.pytorch.fixed import apply_fixed_architecture from nni.nas.pytorch.utils import AverageMeterGroup from torch.utils.tensorboard import SummaryWriter @@ -14,14 +16,13 @@ logger = logging.getLogger("nni") -def train(epoch, model, criterion, optimizer, loader, num_iters, writer, args): +def train(epoch, model, criterion, optimizer, loader, writer, args): model.train() meters = AverageMeterGroup() cur_lr = optimizer.param_groups[0]["lr"] - for step, data in enumerate(loader): - cur_step = num_iters * epoch + step - x, y = convert_data_format_dali(data) + for step, (x, y) in enumerate(loader): + cur_step = len(loader) * epoch + step optimizer.zero_grad() logits = model(x) loss = criterion(logits, y) @@ -37,28 +38,27 @@ def train(epoch, model, criterion, optimizer, loader, num_iters, writer, args): writer.add_scalar("acc1/train", metrics["acc1"], global_step=cur_step) writer.add_scalar("acc5/train", metrics["acc5"], global_step=cur_step) - if step % args.log_frequency == 0 or step + 1 == num_iters: + if step % args.log_frequency == 0 or step + 1 == len(loader): logger.info("Epoch [%d/%d] Step [%d/%d] %s", epoch + 1, - args.epochs, step + 1, num_iters, meters) + args.epochs, step + 1, len(loader), meters) logger.info("Epoch %d training summary: %s", epoch + 1, meters) -def validate(epoch, model, criterion, loader, num_iters, writer, args): +def validate(epoch, model, criterion, loader, writer, args): model.eval() meters = AverageMeterGroup() with torch.no_grad(): - for step, data in enumerate(loader): - x, y = convert_data_format_dali(data) + for step, (x, y) in enumerate(loader): logits = model(x) loss = criterion(logits, y) metrics = accuracy(logits, y) metrics["loss"] = loss.item() meters.update(metrics) - if step % args.log_frequency == 0 or step + 1 == num_iters: + if step % args.log_frequency == 0 or step + 1 == len(loader): logger.info("Epoch [%d/%d] Validation Step [%d/%d] %s", epoch + 1, - args.epochs, step + 1, num_iters, meters) + args.epochs, step + 1, len(loader), meters) writer.add_scalar("loss/test", meters.loss.avg, global_step=epoch) writer.add_scalar("acc1/test", meters.acc1.avg, global_step=epoch) @@ -80,9 +80,17 @@ def validate(epoch, model, criterion, loader, num_iters, writer, args): parser.add_argument("--weight-decay", type=float, default=4E-5) parser.add_argument("--label-smooth", type=float, default=0.1) parser.add_argument("--log-frequency", type=int, default=10) + 
parser.add_argument("--lr-decay", type=str, default="linear") + parser.add_argument("--seed", type=int, default=42) args = parser.parse_args() + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + np.random.seed(args.seed) + random.seed(args.seed) + torch.backends.cudnn.deterministic = True + model = ShuffleNetV2OneShot() model.cuda() apply_fixed_architecture(model, args.architecture) @@ -91,20 +99,23 @@ def validate(epoch, model, criterion, loader, num_iters, writer, args): criterion = CrossEntropyLabelSmooth(1000, 0.1) optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) - scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, - lambda step: (1.0 - step / args.epochs) - if step <= args.epochs else 0, - last_epoch=-1) + if args.lr_decay == "linear": + scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, + lambda step: (1.0 - step / args.epochs) + if step <= args.epochs else 0, + last_epoch=-1) + elif args.lr_decay == "cosine": + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, 1E-3) + else: + raise ValueError("'%s' not supported." % args.lr_decay) writer = SummaryWriter(log_dir=args.tb_dir) - train_loader, train_iters = get_imagenet_iter_dali("train", args.imagenet_dir, args.batch_size, args.workers) - val_loader, val_iters = get_imagenet_iter_dali("val", args.imagenet_dir, args.batch_size, args.workers) + train_loader = get_imagenet_iter_dali("train", args.imagenet_dir, args.batch_size, args.workers) + val_loader = get_imagenet_iter_dali("val", args.imagenet_dir, args.batch_size, args.workers) for epoch in range(args.epochs): - train(epoch, model, criterion, optimizer, train_loader, train_iters, writer, args) - validate(epoch, model, criterion, val_loader, val_iters, writer, args) + train(epoch, model, criterion, optimizer, train_loader, writer, args) + validate(epoch, model, criterion, val_loader, writer, args) scheduler.step() - train_loader.reset() - val_loader.reset() writer.close() diff --git a/examples/nas/spos/supernet.py b/examples/nas/spos/supernet.py index 6d5160a088..f440631c97 100644 --- a/examples/nas/spos/supernet.py +++ b/examples/nas/spos/supernet.py @@ -1,13 +1,16 @@ import argparse +import random +import numpy as np import torch import torch.nn as nn from nni.nas.pytorch.callbacks import LRSchedulerCallback from nni.nas.pytorch.callbacks import ModelCheckpoint from nni.nas.pytorch.spos import SPOSSupernetTrainingMutator, SPOSSupernetTrainer +from dataloader import get_imagenet_iter_dali from network import ShuffleNetV2OneShot, load_and_parse_state_dict -from utils import get_imagenet, CrossEntropyLabelSmooth, accuracy +from utils import CrossEntropyLabelSmooth, accuracy if __name__ == "__main__": parser = argparse.ArgumentParser("SPOS Supernet Training") @@ -17,16 +20,23 @@ help="When true, image values will range from 0 to 255 and use BGR " "(as in original repo).") parser.add_argument("--workers", type=int, default=4) - parser.add_argument("--batch-size", type=int, default=1024) + parser.add_argument("--batch-size", type=int, default=960) parser.add_argument("--epochs", type=int, default=120) parser.add_argument("--learning-rate", type=float, default=0.5) parser.add_argument("--momentum", type=float, default=0.9) parser.add_argument("--weight-decay", type=float, default=4E-5) parser.add_argument("--label-smooth", type=float, default=0.1) parser.add_argument("--log-frequency", type=int, default=10) + parser.add_argument("--seed", type=int, 
default=42) args = parser.parse_args() - dataset_train, dataset_valid = get_imagenet(args.imagenet_dir, spos_pre=args.spos_preprocessing) + + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + np.random.seed(args.seed) + random.seed(args.seed) + torch.backends.cudnn.deterministic = True + model = ShuffleNetV2OneShot() if args.load_checkpoint: if not args.spos_preprocessing: @@ -43,11 +53,14 @@ lambda step: (1.0 - step / args.epochs) if step <= args.epochs else 0, last_epoch=-1) + train_loader = get_imagenet_iter_dali("train", args.imagenet_dir, args.batch_size, args.workers, + spos_preprocessing=args.spos_preprocessing) + valid_loader = get_imagenet_iter_dali("val", args.imagenet_dir, args.batch_size, args.workers, + spos_preprocessing=args.spos_preprocessing) trainer = SPOSSupernetTrainer(model, criterion, accuracy, optimizer, - args.epochs, dataset_train, dataset_valid, + args.epochs, train_loader, valid_loader, mutator=mutator, batch_size=args.batch_size, log_frequency=args.log_frequency, workers=args.workers, callbacks=[LRSchedulerCallback(scheduler), ModelCheckpoint("./checkpoints")]) trainer.train() - # trainer.validate() diff --git a/examples/nas/spos/tester.py b/examples/nas/spos/tester.py index 5c9e1f7b79..3f0be88c1a 100644 --- a/examples/nas/spos/tester.py +++ b/examples/nas/spos/tester.py @@ -11,7 +11,7 @@ from nni.nas.pytorch.classic_nas import get_and_apply_next_architecture from nni.nas.pytorch.utils import AverageMeterGroup -from dataloader import get_imagenet_iter_dali, convert_data_format_dali +from dataloader import get_imagenet_iter_dali from network import ShuffleNetV2OneShot, load_and_parse_state_dict from utils import CrossEntropyLabelSmooth, accuracy @@ -30,7 +30,7 @@ def retrain_bn(model, criterion, max_iters, log_freq, loader): model.train() meters = AverageMeterGroup() for step in range(max_iters): - inputs, targets = convert_data_format_dali(next(loader)) + inputs, targets = next(loader) logits = model(inputs) loss = criterion(logits, targets) metrics = accuracy(logits, targets) @@ -46,16 +46,15 @@ def test_acc(model, criterion, log_freq, loader): meters = AverageMeterGroup() start_time = time.time() with torch.no_grad(): - for step, data in enumerate(loader): - inputs, targets = convert_data_format_dali(data) + for step, (inputs, targets) in enumerate(loader): logits = model(inputs) loss = criterion(logits, targets) metrics = accuracy(logits, targets) metrics["loss"] = loss.item() meters.update(metrics) - if step % log_freq == 0: - logger.info("Valid Step [%d] time %.3fs acc1 %.4f acc5 %.4f loss %.4f", - step + 1, time.time() - start_time, + if step % log_freq == 0 or step + 1 == len(loader): + logger.info("Valid Step [%d/%d] time %.3fs acc1 %.4f acc5 %.4f loss %.4f", + step + 1, len(loader), time.time() - start_time, meters.acc1.avg, meters.acc5.avg, meters.loss.avg) return meters.acc1.avg @@ -102,12 +101,12 @@ def evaluate_acc(model, criterion, args, loader_train, loader_test): model.load_state_dict(load_and_parse_state_dict(filepath=args.checkpoint)) model.cuda() - train_loader, train_iters = get_imagenet_iter_dali("train", args.imagenet_dir, args.train_batch_size, args.workers, - auto_reset=True, spos_preprocessing=args.spos_preprocessing, - seed=args.seed, device_id=0) - val_loader, val_iters = get_imagenet_iter_dali("val", args.imagenet_dir, args.test_batch_size, args.workers, - spos_preprocessing=args.spos_preprocessing, shuffle=True, - seed=args.seed, device_id=0, auto_reset=True) + train_loader = get_imagenet_iter_dali("train", 
args.imagenet_dir, args.train_batch_size, args.workers, + spos_preprocessing=args.spos_preprocessing, + seed=args.seed, device_id=0) + val_loader = get_imagenet_iter_dali("val", args.imagenet_dir, args.test_batch_size, args.workers, + spos_preprocessing=args.spos_preprocessing, shuffle=True, + seed=args.seed, device_id=0) train_loader = cycle(train_loader) evaluate_acc(model, criterion, args, train_loader, val_loader) diff --git a/examples/nas/spos/tuner.py b/examples/nas/spos/tuner.py new file mode 100644 index 0000000000..df1861d613 --- /dev/null +++ b/examples/nas/spos/tuner.py @@ -0,0 +1,17 @@ +from nni.nas.pytorch.spos import SPOSEvolution + +from network import ShuffleNetV2OneShot + + +class EvolutionWithFlops(SPOSEvolution): + def __init__(self, flops_limit=330E6, **kwargs): + super().__init__(**kwargs) + self.model = ShuffleNetV2OneShot() + self.flops_limit = flops_limit + + def _is_legal(self, cand): + if not super()._is_legal(cand): + return False + if self.model.get_candidate_flops(cand) > self.flops_limit: + return False + return True diff --git a/examples/nas/spos/utils.py b/examples/nas/spos/utils.py index eaed7128e6..1e101697c2 100644 --- a/examples/nas/spos/utils.py +++ b/examples/nas/spos/utils.py @@ -1,70 +1,5 @@ -import os - -import numpy as np import torch import torch.nn as nn -from PIL import Image -from torchvision import transforms -from torchvision.datasets import ImageNet - -IMAGENET_MEAN = [0.485, 0.456, 0.406] -IMAGENET_STD = [0.229, 0.224, 0.225] - - -def spos_to_bgr_tensor(pic): - """Modified from `to_tensor`""" - if not isinstance(pic, Image.Image): - raise TypeError('pic should be PIL Image. Got {}'.format(type(pic))) - - if pic.mode == 'I': - img = torch.from_numpy(np.array(pic, np.int32, copy=False)) - elif pic.mode == 'I;16': - img = torch.from_numpy(np.array(pic, np.int16, copy=False)) - elif pic.mode == 'F': - img = torch.from_numpy(np.array(pic, np.float32, copy=False)) - elif pic.mode == '1': - img = 255 * torch.from_numpy(np.array(pic, np.uint8, copy=False)) - else: - img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) - if pic.mode == 'YCbCr': - nchannel = 3 - elif pic.mode == 'I;16': - nchannel = 1 - else: - nchannel = len(pic.mode) - img = img.view(pic.size[1], pic.size[0], nchannel) - # put it from HWC to CHW format - # yikes, this transpose takes 80% of the loading time/CPU - img = img[:, :, [2, 1, 0]].transpose(0, 1).transpose(0, 2).contiguous() - return img.float() if isinstance(img, torch.ByteTensor) else img - - -def get_imagenet(imagenet_root, spos_pre): - if not os.path.exists(imagenet_root): - raise FileNotFoundError("Imagenet root {} not exists. 
Pointing to the right directory with " - "command-line arguments.".format(imagenet_root)) - if spos_pre: - postprocess = [ - transforms.Lambda(lambda img: spos_to_bgr_tensor(img)) - ] - else: - postprocess = [ - transforms.ToTensor(), - transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD) - ] - - train_transform = transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4), - transforms.RandomHorizontalFlip(0.5), - ] + postprocess) - valid_transform = transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - ] + postprocess) - train_dataset = ImageNet(imagenet_root, split="train", transform=train_transform) - valid_dataset = ImageNet(imagenet_root, split="val", transform=valid_transform) - return train_dataset, valid_dataset class CrossEntropyLabelSmooth(nn.Module): diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/__init__.py b/src/sdk/pynni/nni/nas/pytorch/spos/__init__.py index dc43892384..ed432b0845 100644 --- a/src/sdk/pynni/nni/nas/pytorch/spos/__init__.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +from .evolution import SPOSEvolution from .mutator import SPOSSupernetTrainingMutator from .trainer import SPOSSupernetTrainer diff --git a/examples/nas/spos/evolution.py b/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py similarity index 87% rename from examples/nas/spos/evolution.py rename to src/sdk/pynni/nni/nas/pytorch/spos/evolution.py index efc056c334..3fac761419 100644 --- a/examples/nas/spos/evolution.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py @@ -1,20 +1,20 @@ import json import logging +import os import re from collections import deque import numpy as np from nni.tuner import Tuner -from network import ShuffleNetV2OneShot -_logger = logging.getLogger("nni") +_logger = logging.getLogger(__name__) class SPOSEvolution(Tuner): def __init__(self, max_epochs=20, num_select=10, num_population=50, m_prob=0.1, - num_crossover=25, num_mutation=25, flops_limit=330E6): + num_crossover=25, num_mutation=25): assert num_population >= num_select self.max_epochs = max_epochs self.num_select = num_select @@ -22,8 +22,6 @@ def __init__(self, max_epochs=20, num_select=10, num_population=50, m_prob=0.1, self.m_prob = m_prob self.num_crossover = num_crossover self.num_mutation = num_mutation - self.flops_limit = flops_limit - self.model = ShuffleNetV2OneShot() self.epoch = 0 self.candidates = [] self.search_space = None @@ -42,13 +40,15 @@ def update_search_space(self, search_space): self._next_round() def _next_round(self): - if self.epoch >= self.max_epochs: - return _logger.info("Epoch %d, generating...", self.epoch) if self.epoch == 0: self._get_random_population() + self.export_results(self.candidates) else: best_candidates = self._select_top_candidates() + self.export_results(best_candidates) + if self.epoch >= self.max_epochs: + return self.candidates = self._get_mutation(best_candidates) + self._get_crossover(best_candidates) self._get_random_population() self.epoch += 1 @@ -65,8 +65,7 @@ def _random_candidate(self): return chosen_arch def _add_to_evaluate_queue(self, cand): - _logger.info("Generate candidate %s with flops %d, adding to eval queue.", - self._get_architecture_repr(cand), self.model.get_candidate_flops(cand)) + _logger.info("Generate candidate %s, adding to eval queue.", self._get_architecture_repr(cand)) self._reward_dict[self._hashcode(cand)] = 0. 
self._to_evaluate_queue.append(cand) @@ -119,8 +118,6 @@ def _get_architecture_repr(self, cand): def _is_legal(self, cand): if self._hashcode(cand) in self._reward_dict: return False - if self.model.get_candidate_flops(cand) > self.flops_limit: - return False return True def _select_top_candidates(self): @@ -165,3 +162,13 @@ def trial_end(self, parameter_id, success, **kwargs): self._next_round() assert self._st_callback is not None self._bind_and_send_parameters() + + def export_results(self, result): + os.makedirs("checkpoints", exist_ok=True) + for i, cand in enumerate(result): + converted = dict() + for cand_key, cand_val in cand.items(): + onehot = [k == cand_val["_idx"] for k in range(len(self._search_space[cand_key]["_value"]))] + converted[cand_key] = onehot + with open(os.path.join("checkpoints", "%03d_%03d.json" % (self.epoch, i)), "w") as fp: + json.dump(converted, fp) diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py b/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py index 04086ec795..846acf7b08 100644 --- a/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py @@ -14,28 +14,21 @@ class SPOSSupernetTrainer(Trainer): def __init__(self, model, loss, metrics, - optimizer, num_epochs, dataset_train, dataset_valid, + optimizer, num_epochs, train_loader, valid_loader, mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, callbacks=None): + assert torch.cuda.is_available() super().__init__(model, mutator if mutator is not None else SPOSSupernetTrainingMutator(model), - loss, metrics, optimizer, num_epochs, dataset_train, dataset_valid, + loss, metrics, optimizer, num_epochs, None, None, batch_size, workers, device, log_frequency, callbacks) - self.train_loader = torch.utils.data.DataLoader(self.dataset_train, - batch_size=batch_size, - num_workers=workers, - shuffle=True) - self.valid_loader = torch.utils.data.DataLoader(self.dataset_valid, - batch_size=batch_size, - num_workers=workers, - shuffle=True) + self.train_loader = train_loader + self.valid_loader = valid_loader def train_one_epoch(self, epoch): self.model.train() meters = AverageMeterGroup() for step, (x, y) in enumerate(self.train_loader): - x, y = x.to(self.device), y.to(self.device) - self.optimizer.zero_grad() self.mutator.reset() logits = self.model(x) @@ -55,7 +48,6 @@ def validate_one_epoch(self, epoch): meters = AverageMeterGroup() with torch.no_grad(): for step, (x, y) in enumerate(self.valid_loader): - x, y = x.to(self.device), y.to(self.device) self.mutator.reset() logits = self.model(x) loss = self.loss(logits, y) From f4e893d94d3a81abfb26a1feb47018673c9fcdd0 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 17 Dec 2019 16:35:33 +0800 Subject: [PATCH 39/57] update --- examples/nas/spos/dataloader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/nas/spos/dataloader.py b/examples/nas/spos/dataloader.py index 75445fdf94..14616ba9d9 100644 --- a/examples/nas/spos/dataloader.py +++ b/examples/nas/spos/dataloader.py @@ -98,6 +98,5 @@ def get_imagenet_iter_dali(split, image_dir, batch_size, num_threads, crop=224, raise AssertionError pipeline.build() num_samples = pipeline.epoch_size("Reader") - print(num_samples) return ClassificationWrapper(DALIClassificationIterator(pipeline, size=num_samples, fill_last_batch=split == "train", auto_reset=True), (num_samples + batch_size - 1) // batch_size) From c24322aa52af6ab28808a716c2e8916b4448b401 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 17 Dec 2019 13:31:25 +0000 Subject: [PATCH 
40/57] update --- examples/nas/spos/scratch.py | 2 +- examples/nas/spos/supernet.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py index 4e6919ddf2..64efe618f6 100644 --- a/examples/nas/spos/scratch.py +++ b/examples/nas/spos/scratch.py @@ -68,7 +68,7 @@ def validate(epoch, model, criterion, loader, writer, args): if __name__ == "__main__": - parser = argparse.ArgumentParser("SPOS Supernet Training") + parser = argparse.ArgumentParser("SPOS Training From Scratch") parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet") parser.add_argument("--tb-dir", type=str, default="runs") parser.add_argument("--architecture", type=str, default="architecture_final.json") diff --git a/examples/nas/spos/supernet.py b/examples/nas/spos/supernet.py index f440631c97..6bd2190a55 100644 --- a/examples/nas/spos/supernet.py +++ b/examples/nas/spos/supernet.py @@ -20,7 +20,7 @@ help="When true, image values will range from 0 to 255 and use BGR " "(as in original repo).") parser.add_argument("--workers", type=int, default=4) - parser.add_argument("--batch-size", type=int, default=960) + parser.add_argument("--batch-size", type=int, default=896) parser.add_argument("--epochs", type=int, default=120) parser.add_argument("--learning-rate", type=float, default=0.5) parser.add_argument("--momentum", type=float, default=0.9) From 138764eb4dc8806e2b32f45afacc7c17a24679c1 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Tue, 17 Dec 2019 14:11:07 +0000 Subject: [PATCH 41/57] update --- examples/nas/spos/supernet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nas/spos/supernet.py b/examples/nas/spos/supernet.py index 6bd2190a55..e1df20e28e 100644 --- a/examples/nas/spos/supernet.py +++ b/examples/nas/spos/supernet.py @@ -20,7 +20,7 @@ help="When true, image values will range from 0 to 255 and use BGR " "(as in original repo).") parser.add_argument("--workers", type=int, default=4) - parser.add_argument("--batch-size", type=int, default=896) + parser.add_argument("--batch-size", type=int, default=840) parser.add_argument("--epochs", type=int, default=120) parser.add_argument("--learning-rate", type=float, default=0.5) parser.add_argument("--momentum", type=float, default=0.9) From 7fb280ab827350931cd3e09f1adb0c89b8acb812 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Wed, 18 Dec 2019 07:18:35 +0000 Subject: [PATCH 42/57] fix pylint --- src/sdk/pynni/nni/nas/pytorch/spos/evolution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py b/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py index 3fac761419..69445f7424 100644 --- a/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py @@ -164,7 +164,7 @@ def trial_end(self, parameter_id, success, **kwargs): self._bind_and_send_parameters() def export_results(self, result): -
os.makedirs("checkpoints", exist_ok=True) + os.makedirs("checkpoints", exist_ok=True) for i, cand in enumerate(result): converted = dict() for cand_key, cand_val in cand.items(): From 001c581def2ea25b666ed959f9f5403eb42ce286 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Wed, 18 Dec 2019 13:36:38 +0000 Subject: [PATCH 43/57] update batch size --- examples/nas/spos/supernet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nas/spos/supernet.py b/examples/nas/spos/supernet.py index e1df20e28e..ad219db078 100644 --- a/examples/nas/spos/supernet.py +++ b/examples/nas/spos/supernet.py @@ -20,7 +20,7 @@ help="When true, image values will range from 0 to 255 and use BGR " "(as in original repo).") parser.add_argument("--workers", type=int, default=4) - parser.add_argument("--batch-size", type=int, default=840) + parser.add_argument("--batch-size", type=int, default=768) parser.add_argument("--epochs", type=int, default=120) parser.add_argument("--learning-rate", type=float, default=0.5) parser.add_argument("--momentum", type=float, default=0.9) From 5a00af5091e02d02ec3c2d22928b7e4b7e572f83 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 23 Dec 2019 11:26:33 +0800 Subject: [PATCH 44/57] update --- examples/nas/spos/scratch.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py index 64efe618f6..0681377ac4 100644 --- a/examples/nas/spos/scratch.py +++ b/examples/nas/spos/scratch.py @@ -82,6 +82,7 @@ def validate(epoch, model, criterion, loader, writer, args): parser.add_argument("--log-frequency", type=int, default=10) parser.add_argument("--lr-decay", type=str, default="linear") parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--spos-preprocessing", default=False, action="store_true") args = parser.parse_args() @@ -110,8 +111,10 @@ def validate(epoch, model, criterion, loader, writer, args): raise ValueError("'%s' not supported." % args.lr_decay) writer = SummaryWriter(log_dir=args.tb_dir) - train_loader = get_imagenet_iter_dali("train", args.imagenet_dir, args.batch_size, args.workers) - val_loader = get_imagenet_iter_dali("val", args.imagenet_dir, args.batch_size, args.workers) + train_loader = get_imagenet_iter_dali("train", args.imagenet_dir, args.batch_size, args.workers, + spos_preprocessing=args.spos_preprocessing) + val_loader = get_imagenet_iter_dali("val", args.imagenet_dir, args.batch_size, args.workers, + spos_preprocessing=args.spos_preprocessing) for epoch in range(args.epochs): train(epoch, model, criterion, optimizer, train_loader, writer, args) From 4cef622ace57901a4106a2b282c5a68b7bcefe58 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Mon, 23 Dec 2019 16:50:11 +0800 Subject: [PATCH 45/57] add evolution doc --- examples/nas/spos/readme.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/nas/spos/readme.md b/examples/nas/spos/readme.md index 31ee641bb7..985277b84c 100644 --- a/examples/nas/spos/readme.md +++ b/examples/nas/spos/readme.md @@ -27,6 +27,8 @@ NOTE: The data loading used in the official repo is [slightly different from usu ## Step 2. Evolution Search +Single Path One-Shot leverages evolution algorithm to search for the best architecture. The tester, which is responsible for testing the sampled architecture, recalculates all the batch norm for a subset of training images, and evaluates the architecture on the full validation set. 
+ To have a search space ready for NNI framework, first run ``` From ca47a5b958d4a34974b7c169f420b828dce684d6 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Mon, 23 Dec 2019 17:05:15 +0800 Subject: [PATCH 46/57] remove decision --- src/sdk/pynni/nni/nas/pytorch/decision.py | 158 ---------------------- 1 file changed, 158 deletions(-) delete mode 100644 src/sdk/pynni/nni/nas/pytorch/decision.py diff --git a/src/sdk/pynni/nni/nas/pytorch/decision.py b/src/sdk/pynni/nni/nas/pytorch/decision.py deleted file mode 100644 index 48ef728945..0000000000 --- a/src/sdk/pynni/nni/nas/pytorch/decision.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -# Module under development. Not ready for use. - -import logging - -import numpy as np -import torch - -_logger = logging.getLogger(__name__) - - -class Decision: - def __init__(self): - raise NotImplementedError("You should never use init to initialize a general decision.") - - @classmethod - def from_nni_protocol_format(cls, candidate, search_space=None): - assert "_idx" in candidate and "_val" in candidate, "A candidate must have '_idx' and '_val' in its fields." - assert type(candidate["_idx"]) == type(candidate["_val"]), "Indices and values must have the same type." - if search_space is not None: - search_space_values = search_space["_values"] - if isinstance(candidate["_idx"], list): - assert len(candidate["_idx"]) == len(candidate["_val"]), \ - "Number of indices must be equal of number of values." - for idx, val in zip(candidate["_idx"], candidate["_val"]): - assert 0 <= idx < len(search_space_values) and search_space_values[idx] == val, \ - "Index '{}' in search space '{}' is not '{}'".format(idx, search_space_values, val) - elif isinstance(candidate["_idx"], int): - idx, val = candidate["_idx"], candidate["_val"] - assert 0 <= idx < len(search_space_values) and search_space_values[idx] == val, \ - "Index '{}' in search space '{}' is not '{}'".format(idx, search_space_values, val) - else: - raise ValueError("Index of unrecognized type: {}".format(candidate["_idx"])) - return cls.from_indices(candidate["_idx"], len(search_space_values)) - return cls.from_indices(candidate["_idx"]) - - @classmethod - def from_indices(cls, indices, n_candidates=None): - """ - Construct a decision from indices. 
- - Parameters - ---------- - indices : int or list of int - n_candidates : int - - Returns - ------- - RelaxedDecision - """ - return RelaxedDecision(indices, n_candidates) - - @classmethod - def deserialize(cls, obj): - if obj is None: - return EmptyDecision() - if isinstance(obj, dict) and "_idx" in obj: - return cls.from_nni_protocol_format(obj) - if isinstance(obj, int): - return cls.from_indices(obj) - obj_type = cls._list_type(obj) - if obj_type == int: - # list of indices - return cls.from_indices(obj) - if obj_type == float: - # list of weights - return ContinuousDecision(obj) - if obj_type == bool: - # one/multi-hot tensor - return RelaxedDecision.from_multi_hot_iterable(obj) - - @staticmethod - def _list_type(lst): - # get the element type of a list / tensor - - def _print_all_01_warning(): - if all_01: - _logger.warning("All elements in %s are 0 and 1, but type is not bool.", lst) - - all_01 = all(map(lambda x: x in [0., 1.], lst)) - if torch.is_tensor(lst): - type_lower = lst.type().lower() - if "bool" in type_lower: - return bool - _print_all_01_warning() - if "float" in type_lower: - return float - raise ValueError("Unsupported tensor type: {}".format(type_lower)) - if all(map(lambda x: isinstance(x, bool), lst)): - return bool - _print_all_01_warning() - for t in (int, float): - if all(map(lambda x: isinstance(x, t), lst)): - return t - - def serialize(self): - raise NotImplementedError - - -class EmptyDecision(Decision): - def serialize(self): - return None - - def __iter__(self): - raise StopIteration - - -class RelaxedDecision(Decision): - def __init__(self, indices, n_candidates=None): - if isinstance(indices, int): - self.indices = [indices] - elif isinstance(indices, list): - self.indices = indices - assert len(set(self.indices)) == len(self.indices), "Indices must be unique" - self.n_candidates = n_candidates - if n_candidates is not None: - assert all(map(lambda x: 0 <= x < n_candidates, self.indices)), \ - "Indices must be in range [0, n_candidates)." - - @classmethod - def from_multi_hot_iterable(cls, iterable): - indices, total = [], 0 - for i, t in enumerate(iterable): - if t: - indices.append(i) - total += 1 - return cls(indices, total) - - def serialize(self): - if len(self.indices) == 1: - return self.index - return self.indices - - @property - def index(self): - if len(self.indices) > 1: - raise ValueError("More than one indices. 
Index doesn't work.") - return self.indices[0] - - def __iter__(self): - return iter(self.indices) - - -class ContinuousDecision: - def __init__(self, weights): - self.weights = weights - - def serialize(self): - if torch.is_tensor(self.weights): - return self.weights.detach().numpy().tolist() - if isinstance(self.weights, np.ndarray): - return self.weights.tolist() - return self.weights - - def __iter__(self): - return iter(self.weights) From 5c2fbd2de61832d47b8a1f545235fc44cfce7550 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Mon, 23 Dec 2019 17:21:08 +0800 Subject: [PATCH 47/57] add docstring --- .../pynni/nni/nas/pytorch/spos/evolution.py | 44 +++++++++++++++++++ src/sdk/pynni/nni/nas/pytorch/spos/mutator.py | 29 ++++++++++++ src/sdk/pynni/nni/nas/pytorch/spos/trainer.py | 4 ++ 3 files changed, 77 insertions(+) diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py b/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py index 69445f7424..937964a85d 100644 --- a/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py @@ -15,6 +15,26 @@ class SPOSEvolution(Tuner): def __init__(self, max_epochs=20, num_select=10, num_population=50, m_prob=0.1, num_crossover=25, num_mutation=25): + """ + Initialize SPOS Evolution Tuner. + + Parameters + ---------- + max_epochs : int + Maximum number of epochs to run. + num_select : int + Number of survival candidates of each epoch. + num_population : int + Number of candidates at the start of each epoch. If candidates generated by + crossover and mutation are not enough, the rest will be filled with random + candidates. + m_prob : float + The probability of mutation. + num_crossover : int + Number of candidates generated by crossover in each epoch. + num_mutation : int + Number of candidates generated by mutation in each epoch. + """ assert num_population >= num_select self.max_epochs = max_epochs self.num_select = num_select @@ -36,6 +56,9 @@ def __init__(self, max_epochs=20, num_select=10, num_population=50, m_prob=0.1, self._st_callback = None def update_search_space(self, search_space): + """ + Handle the initialization/update event of search space. + """ self._search_space = search_space self._next_round() @@ -132,6 +155,10 @@ def _hashcode(d): return json.dumps(d, sort_keys=True) def _bind_and_send_parameters(self): + """ + There are two types of resources: parameter ids and candidates. This function is called at + necessary times to bind these resources to send new trials with st_callback. + """ result = [] while self._sending_parameter_queue and self._to_evaluate_queue: parameter_id = self._sending_parameter_queue.popleft() @@ -144,6 +171,10 @@ def _bind_and_send_parameters(self): return result def generate_multiple_parameters(self, parameter_id_list, **kwargs): + """ + Callback function necessary to implement a tuner. This will put more parameter ids into the + parameter id queue. + """ if "st_callback" in kwargs and self._st_callback is None: self._st_callback = kwargs["st_callback"] for parameter_id in parameter_id_list: @@ -152,10 +183,16 @@ def generate_multiple_parameters(self, parameter_id_list, **kwargs): return [] # always not use this. might induce problem of over-sending def receive_trial_result(self, parameter_id, parameters, value, **kwargs): + """ + Callback function. Receive a trial result. 
+ """ _logger.info("Candidate %d, reported reward %f", parameter_id, value) self._reward_dict[self._hashcode(self._id2candidate[parameter_id])] = value def trial_end(self, parameter_id, success, **kwargs): + """ + Callback function when a trial is ended and resource is released. + """ self._pending_result_ids.remove(parameter_id) if not self._pending_result_ids and not self._to_evaluate_queue: # a new epoch now @@ -164,6 +201,13 @@ def trial_end(self, parameter_id, success, **kwargs): self._bind_and_send_parameters() def export_results(self, result): + """ + Export a number of candidates to `checkpoints dir. + + Parameters + ---------- + result : dict + """ os.makedirs("checkpoints", exist_ok=True) for i, cand in enumerate(result): converted = dict() diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py b/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py index 7345cb7636..88a01eeeaf 100644 --- a/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py @@ -12,6 +12,24 @@ class SPOSSupernetTrainingMutator(RandomMutator): def __init__(self, model, flops_func=None, flops_lb=None, flops_ub=None, flops_bin_num=7, flops_sample_timeout=500): + """ + + Parameters + ---------- + model : nn.Module + flops_func : callable + Callable that takes a candidate from `sample_search` and returns its candidate. When `flops_func` + is None, functions related to flops will be deactivated. + flops_lb : number + Lower bound of flops. + flops_ub : number + Upper bound of flops. + flops_bin_num : number + Number of bins divided for the interval of flops to ensure the uniformity. Bigger number will be more + uniform, but the sampling will be slower. + flops_sample_timeout : int + Maximum number of attempts to sample before giving up and use a random candidate. + """ super().__init__(model) self._flops_func = flops_func if self._flops_func is not None: @@ -20,6 +38,14 @@ def __init__(self, model, flops_func=None, flops_lb=None, flops_ub=None, self._flops_sample_timeout = flops_sample_timeout def sample_search(self): + """ + Sample a candidate for training. When `flops_func` is not None, candidates will be sampled uniformly + relative to flops. + + Returns + ------- + dict + """ if self._flops_func is not None: for times in range(self._flops_sample_timeout): idx = np.random.randint(self._flops_bin_num) @@ -31,4 +57,7 @@ def sample_search(self): return super().sample_search() def sample_final(self): + """ + Implement only to suffice the interface of Mutator. + """ return self.sample_search() diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py b/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py index 846acf7b08..ab23760bf9 100644 --- a/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py @@ -13,6 +13,10 @@ class SPOSSupernetTrainer(Trainer): + """ + This trainer trains a supernet that can be used for evolution search. 
+ """ + def __init__(self, model, loss, metrics, optimizer, num_epochs, train_loader, valid_loader, mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, From e8d67ca577cb5972d0a8590b0e0573aec7b6d2dd Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Mon, 23 Dec 2019 17:44:43 +0800 Subject: [PATCH 48/57] add docstring --- examples/nas/spos/tuner.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/nas/spos/tuner.py b/examples/nas/spos/tuner.py index df1861d613..13d591c969 100644 --- a/examples/nas/spos/tuner.py +++ b/examples/nas/spos/tuner.py @@ -4,6 +4,11 @@ class EvolutionWithFlops(SPOSEvolution): + """ + This tuner extends the function of evolution tuner, by limiting the flops generated by tuner. + Needs a function to examine the flops. + """ + def __init__(self, flops_limit=330E6, **kwargs): super().__init__(**kwargs) self.model = ShuffleNetV2OneShot() From 18bd1849b5ae0a99e4d582b0829b024f3f82e330 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Mon, 23 Dec 2019 18:39:42 +0800 Subject: [PATCH 49/57] improve docs --- examples/nas/spos/readme.md | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/examples/nas/spos/readme.md b/examples/nas/spos/readme.md index 985277b84c..8d658f640b 100644 --- a/examples/nas/spos/readme.md +++ b/examples/nas/spos/readme.md @@ -1,19 +1,49 @@ # Single Path One-Shot Neural Architecture Search with Uniform Sampling -Single Path One-Shot by Megvii Research. [Paper link](https://arxiv.org/abs/1904.00420) [Official repo](https://github.com/megvii-model/SinglePathOneShot) +Single Path One-Shot by Megvii Research. [Paper link](https://arxiv.org/abs/1904.00420). [Official repo](https://github.com/megvii-model/SinglePathOneShot). Block search only. Channel search is not supported yet. +Only GPU version is provided here. + TODO: Reproduction results. ## Preparation +### Requirements + +* PyTorch >= 1.2 +* NVIDIA DALI >= 0.16 as we use DALI to accelerate the data loading of ImageNet. + +### Data + Need to download the flops lookup table from [here](https://1drv.ms/u/s!Am_mmG2-KsrnajesvSdfsq_cN48?e=aHVppN). Put `op_flops_dict.pkl` and `checkpoint-150000.pth.tar` (if you don't want to retrain the supernet) under `data` directory. Prepare ImageNet in the standard format (follow the script [here](https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4)). Link it to `data/imagenet` will be more convenient. -We don't support SPOS on CPU. You need to have at least one GPU to run the experiment. This is mainly because NVIDIA DALI is used as a prerequisite to accelerate the data loading of ImageNet. +After preparation, it's expected to have the following code structure: + +``` +spos +├── architecture_final.json +├── blocks.py +├── config_search.yml +├── data +│   ├── imagenet +│   │   ├── train +│   │   └── val +│   └── op_flops_dict.pkl +├── dataloader.py +├── network.py +├── nni_auto_gen_search_space.json +├── readme.md +├── scratch.py +├── supernet.py +├── tester.py +├── tuner.py +└── utils.py +``` ## Step 1. Train Supernet @@ -23,7 +53,7 @@ python supernet.py Will export the checkpoint to checkpoints directory, for the next step. -NOTE: The data loading used in the official repo is [slightly different from usual](https://github.com/megvii-model/SinglePathOneShot/issues/5). The option `--spos-preprocessing` will simulate the behavior used originally and enable you to use the checkpoints pretrained. 
+NOTE: The data loading used in the official repo is [slightly different from usual](https://github.com/megvii-model/SinglePathOneShot/issues/5), as they use BGR tensor and keep the values between 0 and 255 intentionally to align with their own DL framework. The option `--spos-preprocessing` will simulate the behavior used originally and enable you to use the checkpoints pretrained. ## Step 2. Evolution Search From 489c6de0f88deb16180bdcdf4084446af11fd5c4 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Mon, 23 Dec 2019 19:31:20 +0800 Subject: [PATCH 50/57] improve architecture readability --- examples/nas/spos/architecture_final.json | 140 ++++------------------ 1 file changed, 20 insertions(+), 120 deletions(-) diff --git a/examples/nas/spos/architecture_final.json b/examples/nas/spos/architecture_final.json index 66d07d9106..512a73b9d6 100644 --- a/examples/nas/spos/architecture_final.json +++ b/examples/nas/spos/architecture_final.json @@ -1,122 +1,22 @@ { - "LayerChoice1": [ - false, - false, - true, - false - ], - "LayerChoice10": [ - true, - false, - false, - false - ], - "LayerChoice11": [ - false, - false, - true, - false - ], - "LayerChoice12": [ - false, - false, - false, - true - ], - "LayerChoice13": [ - true, - false, - false, - false - ], - "LayerChoice14": [ - true, - false, - false, - false - ], - "LayerChoice15": [ - true, - false, - false, - false - ], - "LayerChoice16": [ - true, - false, - false, - false - ], - "LayerChoice17": [ - false, - false, - false, - true - ], - "LayerChoice18": [ - false, - false, - true, - false - ], - "LayerChoice19": [ - false, - false, - false, - true - ], - "LayerChoice2": [ - false, - true, - false, - false - ], - "LayerChoice20": [ - false, - false, - false, - true - ], - "LayerChoice3": [ - true, - false, - false, - false - ], - "LayerChoice4": [ - false, - true, - false, - false - ], - "LayerChoice5": [ - false, - false, - true, - false - ], - "LayerChoice6": [ - true, - false, - false, - false - ], - "LayerChoice7": [ - false, - false, - true, - false - ], - "LayerChoice8": [ - true, - false, - false, - false - ], - "LayerChoice9": [ - false, - false, - true, - false - ] + "LayerChoice1": [false, false, true, false], + "LayerChoice2": [false, true, false, false], + "LayerChoice3": [true, false, false, false], + "LayerChoice4": [false, true, false, false], + "LayerChoice5": [false, false, true, false], + "LayerChoice6": [true, false, false, false], + "LayerChoice7": [false, false, true, false], + "LayerChoice8": [true, false, false, false], + "LayerChoice9": [false, false, true, false], + "LayerChoice10": [true, false, false, false], + "LayerChoice11": [false, false, true, false], + "LayerChoice12": [false, false, false, true], + "LayerChoice13": [true, false, false, false], + "LayerChoice14": [true, false, false, false], + "LayerChoice15": [true, false, false, false], + "LayerChoice16": [true, false, false, false], + "LayerChoice17": [false, false, false, true], + "LayerChoice18": [false, false, true, false], + "LayerChoice19": [false, false, false, true], + "LayerChoice20": [false, false, false, true] } From 752c7d397bb619ce4ee950799494aa4043d21d4f Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Mon, 23 Dec 2019 19:32:52 +0800 Subject: [PATCH 51/57] add note for provided archit --- examples/nas/spos/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nas/spos/readme.md b/examples/nas/spos/readme.md index 8d658f640b..84542fdcc0 100644 --- a/examples/nas/spos/readme.md +++ b/examples/nas/spos/readme.md 
@@ -81,4 +81,4 @@ The final architecture exported from every epoch of evolution can be found in `c python scratch.py ``` -It will automatically use `architecture_final.json`, which is already tracked here. You can use any architecture (e.g., the architecture found in step 2) with `--fixed-arc` option. +By default, it will use `architecture_final.json`. This architecture is provided by the official repo (converted into NNI format). You can use any architecture (e.g., the architecture found in step 2) with `--fixed-arc` option. From a60e8e559a202a7a7dfb7571434635ef94020702 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Tue, 24 Dec 2019 10:36:20 +0800 Subject: [PATCH 52/57] add license --- examples/nas/spos/{readme.md => README.md} | 1 - examples/nas/spos/blocks.py | 3 +++ examples/nas/spos/config_search.yml | 3 +-- examples/nas/spos/dataloader.py | 8 ++++++-- examples/nas/spos/network.py | 8 ++++++-- examples/nas/spos/scratch.py | 3 +++ examples/nas/spos/supernet.py | 3 +++ examples/nas/spos/tester.py | 3 +++ examples/nas/spos/tuner.py | 3 +++ examples/nas/spos/utils.py | 3 +++ src/sdk/pynni/nni/nas/pytorch/spos/evolution.py | 3 +++ 11 files changed, 34 insertions(+), 7 deletions(-) rename examples/nas/spos/{readme.md => README.md} (98%) diff --git a/examples/nas/spos/readme.md b/examples/nas/spos/README.md similarity index 98% rename from examples/nas/spos/readme.md rename to examples/nas/spos/README.md index 84542fdcc0..928b58d2da 100644 --- a/examples/nas/spos/readme.md +++ b/examples/nas/spos/README.md @@ -36,7 +36,6 @@ spos │   └── op_flops_dict.pkl ├── dataloader.py ├── network.py -├── nni_auto_gen_search_space.json ├── readme.md ├── scratch.py ├── supernet.py diff --git a/examples/nas/spos/blocks.py b/examples/nas/spos/blocks.py index acc9f996ab..5908ecf077 100644 --- a/examples/nas/spos/blocks.py +++ b/examples/nas/spos/blocks.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + import torch import torch.nn as nn diff --git a/examples/nas/spos/config_search.yml b/examples/nas/spos/config_search.yml index 2ba5373041..fe27faefc8 100644 --- a/examples/nas/spos/config_search.yml +++ b/examples/nas/spos/config_search.yml @@ -11,7 +11,6 @@ tuner: classFileName: tuner.py className: EvolutionWithFlops trial: - # TODO: change the imagenet dir before release. - command: python tester.py --imagenet-dir /data/ssd1/v-yugzh/imagenet --spos-prep + command: python tester.py --imagenet-dir /path/to/your/imagenet --spos-prep codeDir: . gpuNum: 1 diff --git a/examples/nas/spos/dataloader.py b/examples/nas/spos/dataloader.py index 14616ba9d9..198d637ed1 100644 --- a/examples/nas/spos/dataloader.py +++ b/examples/nas/spos/dataloader.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ import os import nvidia.dali.ops as ops @@ -98,5 +101,6 @@ def get_imagenet_iter_dali(split, image_dir, batch_size, num_threads, crop=224, raise AssertionError pipeline.build() num_samples = pipeline.epoch_size("Reader") - return ClassificationWrapper(DALIClassificationIterator(pipeline, size=num_samples, fill_last_batch=split == "train", - auto_reset=True), (num_samples + batch_size - 1) // batch_size) + return ClassificationWrapper( + DALIClassificationIterator(pipeline, size=num_samples, fill_last_batch=split == "train", + auto_reset=True), (num_samples + batch_size - 1) // batch_size) diff --git a/examples/nas/spos/network.py b/examples/nas/spos/network.py index 9158513f4a..ba45095775 100644 --- a/examples/nas/spos/network.py +++ b/examples/nas/spos/network.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + import os import pickle import re @@ -17,11 +20,12 @@ class ShuffleNetV2OneShot(nn.Module): 'xception_3x3', ] - def __init__(self, input_size=224, first_conv_channels=16, last_conv_channels=1024, n_classes=1000): + def __init__(self, input_size=224, first_conv_channels=16, last_conv_channels=1024, n_classes=1000, + op_flops_path="./data/op_flops_dict.pkl"): super().__init__() assert input_size % 32 == 0 - with open(os.path.join(os.path.dirname(__file__), "./data/op_flops_dict.pkl"), "rb") as fp: + with open(os.path.join(os.path.dirname(__file__), op_flops_path), "rb") as fp: self._op_flops_dict = pickle.load(fp) self.stage_blocks = [4, 4, 8, 4] diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py index 0681377ac4..e2cffa6a1b 100644 --- a/examples/nas/spos/scratch.py +++ b/examples/nas/spos/scratch.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + import argparse import logging import random diff --git a/examples/nas/spos/supernet.py b/examples/nas/spos/supernet.py index ad219db078..0a1c372a49 100644 --- a/examples/nas/spos/supernet.py +++ b/examples/nas/spos/supernet.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + import argparse import random diff --git a/examples/nas/spos/tester.py b/examples/nas/spos/tester.py index 3f0be88c1a..01131ada40 100644 --- a/examples/nas/spos/tester.py +++ b/examples/nas/spos/tester.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + import argparse import logging import random diff --git a/examples/nas/spos/tuner.py b/examples/nas/spos/tuner.py index 13d591c969..fb8b9f2aa4 100644 --- a/examples/nas/spos/tuner.py +++ b/examples/nas/spos/tuner.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + from nni.nas.pytorch.spos import SPOSEvolution from network import ShuffleNetV2OneShot diff --git a/examples/nas/spos/utils.py b/examples/nas/spos/utils.py index 1e101697c2..70ad98b55f 100644 --- a/examples/nas/spos/utils.py +++ b/examples/nas/spos/utils.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + import torch import torch.nn as nn diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py b/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py index 937964a85d..a34da5f230 100644 --- a/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ import json import logging import os From 63271abbe47c94196252dd4edb48c6e5406a0c8d Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Tue, 24 Dec 2019 10:47:23 +0800 Subject: [PATCH 53/57] update --- examples/nas/spos/README.md | 2 +- examples/nas/spos/scratch.py | 5 +++-- examples/nas/spos/supernet.py | 8 ++++++-- examples/nas/spos/tester.py | 2 +- src/sdk/pynni/nni/nas/pytorch/spos/evolution.py | 2 +- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/examples/nas/spos/README.md b/examples/nas/spos/README.md index 928b58d2da..2fbc34590a 100644 --- a/examples/nas/spos/README.md +++ b/examples/nas/spos/README.md @@ -72,7 +72,7 @@ Then search with evolution tuner. nnictl create --config config_search.yml ``` -The final architecture exported from every epoch of evolution can be found in `checkpoints` under the working directory of your tuner, which, by default, is `$HOME/nni/experiments/$EXP_ID/log`. +The final architecture exported from every epoch of evolution can be found in `checkpoints` under the working directory of your tuner, which, by default, is `$HOME/nni/experiments/your_experiment_id/log`. ## Step 3. Train from Scratch diff --git a/examples/nas/spos/scratch.py b/examples/nas/spos/scratch.py index e2cffa6a1b..3a944a7909 100644 --- a/examples/nas/spos/scratch.py +++ b/examples/nas/spos/scratch.py @@ -16,7 +16,7 @@ from network import ShuffleNetV2OneShot from utils import CrossEntropyLabelSmooth, accuracy -logger = logging.getLogger("nni") +logger = logging.getLogger("nni.spos.scratch") def train(epoch, model, criterion, optimizer, loader, writer, args): @@ -86,6 +86,7 @@ def validate(epoch, model, criterion, loader, writer, args): parser.add_argument("--lr-decay", type=str, default="linear") parser.add_argument("--seed", type=int, default=42) parser.add_argument("--spos-preprocessing", default=False, action="store_true") + parser.add_argument("--label-smoothing", type=float, default=0.1) args = parser.parse_args() @@ -100,7 +101,7 @@ def validate(epoch, model, criterion, loader, writer, args): apply_fixed_architecture(model, args.architecture) if torch.cuda.device_count() > 1: # exclude last gpu, saving for data preprocessing on gpu model = nn.DataParallel(model, device_ids=list(range(0, torch.cuda.device_count() - 1))) - criterion = CrossEntropyLabelSmooth(1000, 0.1) + criterion = CrossEntropyLabelSmooth(1000, args.label_smoothing) optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) if args.lr_decay == "linear": diff --git a/examples/nas/spos/supernet.py b/examples/nas/spos/supernet.py index 0a1c372a49..3ab717868c 100644 --- a/examples/nas/spos/supernet.py +++ b/examples/nas/spos/supernet.py @@ -2,6 +2,7 @@ # Licensed under the MIT license. 
import argparse +import logging import random import numpy as np @@ -15,6 +16,8 @@ from network import ShuffleNetV2OneShot, load_and_parse_state_dict from utils import CrossEntropyLabelSmooth, accuracy +logger = logging.getLogger("nni.spos.supernet") + if __name__ == "__main__": parser = argparse.ArgumentParser("SPOS Supernet Training") parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet") @@ -31,6 +34,7 @@ parser.add_argument("--label-smooth", type=float, default=0.1) parser.add_argument("--log-frequency", type=int, default=10) parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--label-smoothing", type=float, default=0.1) args = parser.parse_args() @@ -43,14 +47,14 @@ model = ShuffleNetV2OneShot() if args.load_checkpoint: if not args.spos_preprocessing: - print("You might want to use SPOS preprocessing if you are loading their checkpoints.") + logger.warning("You might want to use SPOS preprocessing if you are loading their checkpoints.") model.load_state_dict(load_and_parse_state_dict()) model.cuda() if torch.cuda.device_count() > 1: # exclude last gpu, saving for data preprocessing on gpu model = nn.DataParallel(model, device_ids=list(range(0, torch.cuda.device_count() - 1))) mutator = SPOSSupernetTrainingMutator(model, flops_func=model.module.get_candidate_flops, flops_lb=290E6, flops_ub=360E6) - criterion = CrossEntropyLabelSmooth(1000, 0.1) + criterion = CrossEntropyLabelSmooth(1000, args.label_smoothing) optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, diff --git a/examples/nas/spos/tester.py b/examples/nas/spos/tester.py index 01131ada40..b31b8f2fab 100644 --- a/examples/nas/spos/tester.py +++ b/examples/nas/spos/tester.py @@ -18,7 +18,7 @@ from network import ShuffleNetV2OneShot, load_and_parse_state_dict from utils import CrossEntropyLabelSmooth, accuracy -logger = logging.getLogger("nni") +logger = logging.getLogger("nni.spos.tester") def retrain_bn(model, criterion, max_iters, log_freq, loader): diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py b/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py index a34da5f230..34d6246eb2 100644 --- a/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py @@ -205,7 +205,7 @@ def trial_end(self, parameter_id, success, **kwargs): def export_results(self, result): """ - Export a number of candidates to `checkpoints dir. + Export a number of candidates to `checkpoints` dir. 
Parameters ---------- From 9871fa6e7df0db97e6595693f0be1aa6f67fc088 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Tue, 24 Dec 2019 10:55:00 +0800 Subject: [PATCH 54/57] use enum string --- src/sdk/pynni/nni/nas/pytorch/spos/evolution.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py b/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py index 34d6246eb2..3541c81fd7 100644 --- a/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py +++ b/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py @@ -9,6 +9,7 @@ import numpy as np from nni.tuner import Tuner +from nni.nas.pytorch.classic_nas.mutator import LAYER_CHOICE, INPUT_CHOICE _logger = logging.getLogger(__name__) @@ -82,11 +83,11 @@ def _next_round(self): def _random_candidate(self): chosen_arch = dict() for key, val in self._search_space.items(): - if val["_type"] == "layer_choice": + if val["_type"] == LAYER_CHOICE: choices = val["_value"] index = self.random_state.randint(len(choices)) chosen_arch[key] = {"_value": choices[index], "_idx": index} - elif val["_type"] == "input_choice": + elif val["_type"] == INPUT_CHOICE: raise NotImplementedError("Input choice is not implemented yet.") return chosen_arch From 86b34e698c480bee38082d2f4be1122fa0e07d24 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Tue, 24 Dec 2019 11:16:43 +0800 Subject: [PATCH 55/57] add reproduction results --- examples/nas/spos/README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/nas/spos/README.md b/examples/nas/spos/README.md index 2fbc34590a..ae1e374b7d 100644 --- a/examples/nas/spos/README.md +++ b/examples/nas/spos/README.md @@ -6,14 +6,12 @@ Block search only. Channel search is not supported yet. Only GPU version is provided here. -TODO: Reproduction results. - ## Preparation ### Requirements * PyTorch >= 1.2 -* NVIDIA DALI >= 0.16 as we use DALI to accelerate the data loading of ImageNet. +* NVIDIA DALI >= 0.16 as we use DALI to accelerate the data loading of ImageNet. [Installation guide](https://docs.nvidia.com/deeplearning/sdk/dali-developer-guide/docs/installation.html) ### Data @@ -81,3 +79,10 @@ python scratch.py ``` By default, it will use `architecture_final.json`. This architecture is provided by the official repo (converted into NNI format). You can use any architecture (e.g., the architecture found in step 2) with `--fixed-arc` option. + +## Current Reproduction Results + +Reproduction is still undergoing. Due to the gap between official release and original paper, we compare our current results with official repo (our run) and paper. + +* Evolution phase is almost aligned with official repo. Our evolution algorithm shows a converging trend and reaches ~65% accuracy at the end of search. +* Retrain phase is not aligned. Our retraining code, which uses the architecture released by the authors, reaches 72.14% accuracy, still having a gap towards 73.61% by official release and 74.3% reported in original paper. From 7ba24b04cbd59e6d78399f21e3f28b055756779e Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Tue, 24 Dec 2019 11:18:08 +0800 Subject: [PATCH 56/57] add reproduction results --- examples/nas/spos/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nas/spos/README.md b/examples/nas/spos/README.md index ae1e374b7d..972552b255 100644 --- a/examples/nas/spos/README.md +++ b/examples/nas/spos/README.md @@ -84,5 +84,5 @@ By default, it will use `architecture_final.json`. 
This architecture is provided Reproduction is still undergoing. Due to the gap between official release and original paper, we compare our current results with official repo (our run) and paper. -* Evolution phase is almost aligned with official repo. Our evolution algorithm shows a converging trend and reaches ~65% accuracy at the end of search. +* Evolution phase is almost aligned with official repo. Our evolution algorithm shows a converging trend and reaches ~65% accuracy at the end of search. This result is not on par with paper. For details, please refer to [this issue](https://github.com/megvii-model/SinglePathOneShot/issues/6). * Retrain phase is not aligned. Our retraining code, which uses the architecture released by the authors, reaches 72.14% accuracy, still having a gap towards 73.61% by official release and 74.3% reported in original paper. From ba009a7997d9c4954c68151ae569641747aa6ef7 Mon Sep 17 00:00:00 2001 From: zhangyuge Date: Tue, 24 Dec 2019 11:19:28 +0800 Subject: [PATCH 57/57] add reproduction results --- examples/nas/spos/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nas/spos/README.md b/examples/nas/spos/README.md index 972552b255..ed239f30a1 100644 --- a/examples/nas/spos/README.md +++ b/examples/nas/spos/README.md @@ -84,5 +84,5 @@ By default, it will use `architecture_final.json`. This architecture is provided Reproduction is still undergoing. Due to the gap between official release and original paper, we compare our current results with official repo (our run) and paper. -* Evolution phase is almost aligned with official repo. Our evolution algorithm shows a converging trend and reaches ~65% accuracy at the end of search. This result is not on par with paper. For details, please refer to [this issue](https://github.com/megvii-model/SinglePathOneShot/issues/6). +* Evolution phase is almost aligned with official repo. Our evolution algorithm shows a converging trend and reaches ~65% accuracy at the end of search. Nevertheless, this result is not on par with paper. For details, please refer to [this issue](https://github.com/megvii-model/SinglePathOneShot/issues/6). * Retrain phase is not aligned. Our retraining code, which uses the architecture released by the authors, reaches 72.14% accuracy, still having a gap towards 73.61% by official release and 74.3% reported in original paper.
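As a companion to Step 3 above, a minimal sketch of how a chosen architecture is applied to the network before retraining. It reuses the `apply_fixed_architecture` helper already called in `scratch.py`; the import path and the checkpoint file name below are assumptions for illustration, not part of the patches above.

```
# Sketch: fix a sampled architecture on the supernet skeleton before retraining.
from nni.nas.pytorch.fixed import apply_fixed_architecture  # assumed import path

from network import ShuffleNetV2OneShot

model = ShuffleNetV2OneShot()
# Either the provided architecture_final.json or a one-hot JSON exported by the
# evolution tuner, e.g. checkpoints/019_000.json (file name is illustrative).
apply_fixed_architecture(model, "architecture_final.json")
```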