From ed1c32d4113f6025cfec6c3c69a6c2a7bb1eb289 Mon Sep 17 00:00:00 2001
From: hao-howard-zhang
Date: Thu, 19 Nov 2020 15:05:11 -0500
Subject: [PATCH 1/5] refactor cifar example based on latest adaptdl

---
 examples/trials/cifar-adaptdl/config_adl.yml  |  4 +-
 .../cifar-adaptdl/{main_nni.py => main.py}    | 90 +++++++------
 ...arch_space_demo.json => search_space.json} |  0
 3 files changed, 36 insertions(+), 58 deletions(-)
 rename examples/trials/cifar-adaptdl/{main_nni.py => main.py} (63%)
 rename examples/trials/cifar-adaptdl/{search_space_demo.json => search_space.json} (100%)

diff --git a/examples/trials/cifar-adaptdl/config_adl.yml b/examples/trials/cifar-adaptdl/config_adl.yml
index 2a8332428e..08c7e27a35 100644
--- a/examples/trials/cifar-adaptdl/config_adl.yml
+++ b/examples/trials/cifar-adaptdl/config_adl.yml
@@ -5,7 +5,7 @@ maxExecDuration: 3h
 maxTrialNum: 10
 nniManagerIp: 10.20.100.199
 trainingServicePlatform: adl
-searchSpacePath: search_space_demo.json
+searchSpacePath: search_space.json
 logCollection: http
 useAnnotation: false
 tuner:
@@ -13,7 +13,7 @@ tuner:
   classArgs:
     optimize_mode: minimize
 trial:
-  command: python3 /cifar10/main_nni.py
+  command: python3 /cifar10/main.py
   codeDir: .
   gpuNum: 1
   image: registry.petuum.com/dev/nni:cifar-example
diff --git a/examples/trials/cifar-adaptdl/main_nni.py b/examples/trials/cifar-adaptdl/main.py
similarity index 63%
rename from examples/trials/cifar-adaptdl/main_nni.py
rename to examples/trials/cifar-adaptdl/main.py
index d4e04644ac..b3d02b1a03 100644
--- a/examples/trials/cifar-adaptdl/main_nni.py
+++ b/examples/trials/cifar-adaptdl/main.py
@@ -1,24 +1,9 @@
-# Copyright 2020 Petuum, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-'''Train CIFAR10 with PyTorch.'''
+'''Train CIFAR10 with PyTorch and AdaptDL. This NNI-AdaptDL example is based on '''
 import torch
 import torch.nn as nn
 import torch.optim as optim
-import torch.nn.functional as F
 import torch.backends.cudnn as cudnn
+import torch.distributed as dist
 import torchvision
 import torchvision.transforms as transforms
 
@@ -29,24 +14,20 @@ from models import *
 
 import adaptdl
-import adaptdl.torch as et
+import adaptdl.torch as adl
 
 from torch.optim.lr_scheduler import MultiStepLR
 from torch.utils.tensorboard import SummaryWriter
 
 import nni
 
-IS_CHIEF = int(os.getenv("ADAPTDL_RANK", "0")) == 0
-
 parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
 parser.add_argument('--bs', default=128, type=int, help='batch size')
 parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
 parser.add_argument('--epochs', default=30, type=int, help='number of epochs')
 parser.add_argument('--model', default='ResNet18', type=str, help='model')
-# TODO(tairui): change this back later
-# parser.add_argument('--autoscale-bsz', dest='autoscale_bsz', default=False, action='store_true', help='autoscale batchsize')
-parser.add_argument('--autoscale-bsz', dest='autoscale_bsz', default=True, action='store_true', help='autoscale batchsize')
+parser.add_argument('--autoscale-bsz', dest='autoscale_bsz', default=False, action='store_true', help='autoscale batchsize')
 args = parser.parse_args()
 
 # load the parameters from nni
@@ -69,30 +50,26 @@ transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
 ])
 
-trainset = torchvision.datasets.CIFAR10(root=adaptdl.env.share_path(), train=True, download=True, transform=transform_train)
-trainloader = et.AdaptiveDataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=2, drop_last=True)
+adaptdl.torch.init_process_group("nccl" if torch.cuda.is_available() else "gloo")
+
+if adaptdl.env.replica_rank() == 0:
+    trainset = torchvision.datasets.CIFAR10(root=adaptdl.env.share_path(), train=True, download=True, transform=transform_train)
+    trainloader = adl.AdaptiveDataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=2, drop_last=True)
+    dist.barrier()  # We use a barrier here so that non-master replicas would wait for master to download the data
+else:
+    dist.barrier()
+    trainset = torchvision.datasets.CIFAR10(root=adaptdl.env.share_path(), train=True, download=False, transform=transform_train)
+    trainloader = adl.AdaptiveDataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=2, drop_last=True)
 
 if args.autoscale_bsz:
-    trainloader.autoscale_batch_size(4096, local_bsz_bounds=(32, 1024))
+    trainloader.autoscale_batch_size(4096, local_bsz_bounds=(32, 1024), gradient_accumulation=True)
 
 validset = torchvision.datasets.CIFAR10(root=adaptdl.env.share_path(), train=False, download=False, transform=transform_test)
-validloader = et.AdaptiveDataLoader(validset, batch_size=100, shuffle=False, num_workers=2)
+validloader = adl.AdaptiveDataLoader(validset, batch_size=100, shuffle=False, num_workers=2)
 
 # Model
 print('==> Building model..')
 net = eval(args.model)()
-# net = VGG('VGG19')
-# net = ResNet18()
-# net = PreActResNet18()
-# net = GoogLeNet()
-# net = DenseNet121()
-# net = ResNeXt29_2x64d()
-# net = MobileNet()
-# net = MobileNetV2()
-# net = DPN92()
-# net = ShuffleNetG2()
-# net = SENet18()
-# net = ShuffleNetV2(1)
 net = net.to(device)
 if device == 'cuda':
     cudnn.benchmark = True
@@ -102,15 +79,13 @@ lr=args.lr, momentum=0.9, weight_decay=5e-4)
 lr_scheduler = MultiStepLR(optimizer, [30, 45], 0.1)
 
-adaptdl.torch.init_process_group("nccl" if torch.cuda.is_available()
-                                 else "gloo")
-net = et.AdaptiveDataParallel(net, optimizer, lr_scheduler)
+net = adl.AdaptiveDataParallel(net, optimizer, lr_scheduler)
 
 # Training
 def train(epoch):
     print('\nEpoch: %d' % epoch)
     net.train()
-    stats = et.Accumulator()
+    stats = adl.Accumulator()
     for inputs, targets in trainloader:
         inputs, targets = inputs.to(device), targets.to(device)
         optimizer.zero_grad()
@@ -124,21 +99,18 @@ def train(epoch):
         stats["total"] += targets.size(0)
         stats["correct"] += predicted.eq(targets).sum().item()
 
-    writer.add_scalar("Throughput/Gain", net.gain, epoch)
-    writer.add_scalar("Throughput/Global_Batchsize",
-                      trainloader.current_batch_size, epoch)
-
+    trainloader.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Data/")
+    net.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Model/")
     with stats.synchronized():
         stats["loss_avg"] = stats["loss_sum"] / stats["total"]
         stats["accuracy"] = stats["correct"] / stats["total"]
         writer.add_scalar("Loss/Train", stats["loss_avg"], epoch)
         writer.add_scalar("Accuracy/Train", stats["accuracy"], epoch)
-        writer.add_scalar("Config/Gpu", int(adaptdl.env.num_replicas()), epoch)
        print("Train:", stats)
 
 def valid(epoch):
     net.eval()
-    stats = et.Accumulator()
+    stats = adl.Accumulator()
     with torch.no_grad():
         for inputs, targets in validloader:
             inputs, targets = inputs.to(device), targets.to(device)
@@ -155,22 +127,28 @@ def valid(epoch):
         stats["accuracy"] = stats["correct"] / stats["total"]
         writer.add_scalar("Loss/Valid", stats["loss_avg"], epoch)
         writer.add_scalar("Accuracy/Valid", stats["accuracy"], epoch)
-        if IS_CHIEF:
-            nni.report_intermediate_result(stats["loss_avg"], accum=stats)
        print("Valid:", stats)
-    return stats["loss_avg"]
+    return stats
 
 
+# TODO BE-12547: Rename ADAPTDLCTL_TENSORBOARD_LOGDIR to ADAPTDL_TENSORBOARD_LOGDIR
 tensorboard_dir = os.path.join(
     os.getenv("ADAPTDLCTL_TENSORBOARD_LOGDIR", "/adaptdl/tensorboard"),
     os.getenv("NNI_TRIAL_JOB_ID", "cifar-adaptdl")
 )
 if not os.path.exists(tensorboard_dir):
     os.makedirs(tensorboard_dir)
+
 with SummaryWriter(tensorboard_dir) as writer:
-    for epoch in et.remaining_epochs_until(args.epochs):
+    acc = 0
+    for epoch in adl.remaining_epochs_until(args.epochs):
         train(epoch)
-        avg_loss = valid(epoch)
+        stats = valid(epoch)
+        acc = stats["accuracy"]
+        if adaptdl.env.replica_rank() == 0:
+            nni.report_intermediate_result(acc, accum=stats)
+
         lr_scheduler.step()
-    if IS_CHIEF:
-        nni.report_final_result(avg_loss)
+
+    if adaptdl.env.replica_rank() == 0:
+        nni.report_final_result(acc)
diff --git a/examples/trials/cifar-adaptdl/search_space_demo.json b/examples/trials/cifar-adaptdl/search_space.json
similarity index 100%
rename from examples/trials/cifar-adaptdl/search_space_demo.json
rename to examples/trials/cifar-adaptdl/search_space.json
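
Note on patch 1: the new data-loading block uses a rank-0 download pattern:
only the chief replica downloads CIFAR-10 into the shared path while the
other replicas block on a barrier, then load the already-downloaded copy.
A minimal standalone sketch of the same pattern (assuming the process group
has already been initialized via adaptdl.torch.init_process_group, as in
the hunk above; the helper name is illustrative):

    import torch.distributed as dist
    import torchvision

    def load_cifar10(root, transform, rank):
        if rank == 0:
            # Chief downloads; the barrier then releases the waiting replicas.
            trainset = torchvision.datasets.CIFAR10(
                root=root, train=True, download=True, transform=transform)
            dist.barrier()
        else:
            # Non-chief replicas wait for the download, then read it from disk.
            dist.barrier()
            trainset = torchvision.datasets.CIFAR10(
                root=root, train=True, download=False, transform=transform)
        return trainset
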
From 54a16e1a44aebe9a124a5821bbc11950bcd4d386 Mon Sep 17 00:00:00 2001
From: hao-howard-zhang
Date: Thu, 19 Nov 2020 15:43:30 -0500
Subject: [PATCH 2/5] fix accumulator

---
 examples/trials/cifar-adaptdl/main.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/trials/cifar-adaptdl/main.py b/examples/trials/cifar-adaptdl/main.py
index b3d02b1a03..6d196fe9a7 100644
--- a/examples/trials/cifar-adaptdl/main.py
+++ b/examples/trials/cifar-adaptdl/main.py
@@ -127,8 +127,12 @@ def valid(epoch):
         stats["accuracy"] = stats["correct"] / stats["total"]
         writer.add_scalar("Loss/Valid", stats["loss_avg"], epoch)
         writer.add_scalar("Accuracy/Valid", stats["accuracy"], epoch)
+
+        if adaptdl.env.replica_rank() == 0:
+            nni.report_intermediate_result(stats["accuracy"], accum=stats)
+
        print("Valid:", stats)
-    return stats
+    return stats["accuracy"]
 
 
 # TODO BE-12547: Rename ADAPTDLCTL_TENSORBOARD_LOGDIR to ADAPTDL_TENSORBOARD_LOGDIR
@@ -143,11 +147,7 @@ def valid(epoch):
     acc = 0
     for epoch in adl.remaining_epochs_until(args.epochs):
         train(epoch)
-        stats = valid(epoch)
-        acc = stats["accuracy"]
-        if adaptdl.env.replica_rank() == 0:
-            nni.report_intermediate_result(acc, accum=stats)
-
+        acc = valid(epoch)
         lr_scheduler.step()
 
     if adaptdl.env.replica_rank() == 0:
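
Note on patch 2: intermediate reporting moves inside valid(), under
stats.synchronized(), so the reported value is computed from all-replica
totals, and both NNI calls stay gated on replica rank 0 so the NNI manager
receives exactly one metric per epoch. The shape of the pattern (the accum=
keyword is specific to the AdaptDL-enabled NNI build used by this example):

    import adaptdl.env
    import nni

    def report_validation(stats):
        # stats is an adaptdl.torch.Accumulator filled during validation.
        with stats.synchronized():
            accuracy = stats["correct"] / stats["total"]
            if adaptdl.env.replica_rank() == 0:
                nni.report_intermediate_result(accuracy, accum=stats)

One side effect worth flagging: the trial now reports accuracy rather than
loss, while config_adl.yml (patch 1) still sets optimize_mode: minimize,
which presumably needs to become maximize in a follow-up change.
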
From 93f86945b2968c4ca7d994a5a626a48c81e751aa Mon Sep 17 00:00:00 2001
From: hao-howard-zhang
Date: Thu, 19 Nov 2020 15:43:49 -0500
Subject: [PATCH 3/5] reuse image cache layer

---
 examples/trials/cifar-adaptdl/Dockerfile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/trials/cifar-adaptdl/Dockerfile b/examples/trials/cifar-adaptdl/Dockerfile
index d972fef7d5..e74a57d67f 100644
--- a/examples/trials/cifar-adaptdl/Dockerfile
+++ b/examples/trials/cifar-adaptdl/Dockerfile
@@ -1,4 +1,6 @@
 FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime
+COPY requirements.txt /requirements.txt
+RUN pip install -r /requirements.txt
+
 COPY ./ /cifar10/
-RUN pip install -r /cifar10/requirements.txt
 
From 156315c9a5a22e42386f0e731320a5bcb18b310c Mon Sep 17 00:00:00 2001
From: hao-howard-zhang
Date: Thu, 19 Nov 2020 16:03:12 -0500
Subject: [PATCH 4/5] enable autoscale bsz

---
 examples/trials/cifar-adaptdl/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/trials/cifar-adaptdl/main.py b/examples/trials/cifar-adaptdl/main.py
index 6d196fe9a7..c81e1f5f0b 100644
--- a/examples/trials/cifar-adaptdl/main.py
+++ b/examples/trials/cifar-adaptdl/main.py
@@ -27,7 +27,7 @@
 parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
 parser.add_argument('--epochs', default=30, type=int, help='number of epochs')
 parser.add_argument('--model', default='ResNet18', type=str, help='model')
-parser.add_argument('--autoscale-bsz', dest='autoscale_bsz', default=False, action='store_true', help='autoscale batchsize')
+parser.add_argument('--autoscale-bsz', dest='autoscale_bsz', default=True, action='store_true', help='autoscale batchsize')
 args = parser.parse_args()
 
 # load the parameters from nni
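
Note on patch 4: because the flag keeps action='store_true' while the
default flips to True, passing --autoscale-bsz is now a no-op and there is
no longer any way to disable autoscaling from the command line. If an off
switch is still wanted, one option is a store_false companion on the same
parser (hypothetical flag name, not part of this series):

    parser.add_argument('--no-autoscale-bsz', dest='autoscale_bsz',
                        action='store_false',
                        help='disable batch-size autoscaling')
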
From 4693956874d829e5f705f56bb8d1b2fb192f28f4 Mon Sep 17 00:00:00 2001
From: hao-howard-zhang
Date: Thu, 19 Nov 2020 17:20:56 -0500
Subject: [PATCH 5/5] add copyright

---
 examples/trials/cifar-adaptdl/main.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/examples/trials/cifar-adaptdl/main.py b/examples/trials/cifar-adaptdl/main.py
index c81e1f5f0b..66181ab7dc 100644
--- a/examples/trials/cifar-adaptdl/main.py
+++ b/examples/trials/cifar-adaptdl/main.py
@@ -1,4 +1,21 @@
-'''Train CIFAR10 with PyTorch and AdaptDL. This NNI-AdaptDL example is based on '''
+# Copyright 2020 Petuum, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Train CIFAR10 with PyTorch and AdaptDL. This example is based on:
+https://github.com/petuum/adaptdl/blob/master/examples/pytorch-cifar/main.py
+'''
 import torch
 import torch.nn as nn
 import torch.optim as optim
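
Note on the series: none of the hunks show the body of the "# load the
parameters from nni" section of main.py. For readers new to NNI, the
standard pattern is roughly as follows (nni.get_next_parameter() is the
real API; the merge into args is illustrative, not taken from main.py):

    import nni

    tuned_params = nni.get_next_parameter()  # one sample drawn from search_space.json
    for name, value in tuned_params.items():
        setattr(args, name, value)  # overwrite the argparse defaults

With the renames in patch 1 applied, the experiment is launched as usual,
e.g. nnictl create --config examples/trials/cifar-adaptdl/config_adl.yml.
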