diff --git a/examples/trials/cifar-adaptdl/Dockerfile b/examples/trials/cifar-adaptdl/Dockerfile
index d972fef7d5..e74a57d67f 100644
--- a/examples/trials/cifar-adaptdl/Dockerfile
+++ b/examples/trials/cifar-adaptdl/Dockerfile
@@ -1,4 +1,6 @@
 FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime
+COPY requirements.txt /requirements.txt
+RUN pip install -r /requirements.txt
+
 COPY ./ /cifar10/
-RUN pip install -r /cifar10/requirements.txt
diff --git a/examples/trials/cifar-adaptdl/config_adl.yml b/examples/trials/cifar-adaptdl/config_adl.yml
index 2a8332428e..08c7e27a35 100644
--- a/examples/trials/cifar-adaptdl/config_adl.yml
+++ b/examples/trials/cifar-adaptdl/config_adl.yml
@@ -5,7 +5,7 @@ maxExecDuration: 3h
 maxTrialNum: 10
 nniManagerIp: 10.20.100.199
 trainingServicePlatform: adl
-searchSpacePath: search_space_demo.json
+searchSpacePath: search_space.json
 logCollection: http
 useAnnotation: false
 tuner:
@@ -13,7 +13,7 @@ tuner:
   classArgs:
-    optimize_mode: minimize
+    optimize_mode: maximize
 trial:
-  command: python3 /cifar10/main_nni.py
+  command: python3 /cifar10/main.py
   codeDir: .
   gpuNum: 1
   image: registry.petuum.com/dev/nni:cifar-example
diff --git a/examples/trials/cifar-adaptdl/main_nni.py b/examples/trials/cifar-adaptdl/main.py
similarity index 73%
rename from examples/trials/cifar-adaptdl/main_nni.py
rename to examples/trials/cifar-adaptdl/main.py
index d4e04644ac..66181ab7dc 100644
--- a/examples/trials/cifar-adaptdl/main_nni.py
+++ b/examples/trials/cifar-adaptdl/main.py
@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-'''Train CIFAR10 with PyTorch.'''
+'''
+Train CIFAR10 with PyTorch and AdaptDL. This example is based on:
+https://github.com/petuum/adaptdl/blob/master/examples/pytorch-cifar/main.py
+'''
 import torch
 import torch.nn as nn
 import torch.optim as optim
-import torch.nn.functional as F
 import torch.backends.cudnn as cudnn
+import torch.distributed as dist
 import torchvision
 import torchvision.transforms as transforms
@@ -29,23 +31,19 @@ from models import *
 
 import adaptdl
-import adaptdl.torch as et
+import adaptdl.torch as adl
 from torch.optim.lr_scheduler import MultiStepLR
 from torch.utils.tensorboard import SummaryWriter
 
 import nni
 
-IS_CHIEF = int(os.getenv("ADAPTDL_RANK", "0")) == 0
-
 parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
 parser.add_argument('--bs', default=128, type=int, help='batch size')
 parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
 parser.add_argument('--epochs', default=30, type=int, help='number of epochs')
 parser.add_argument('--model', default='ResNet18', type=str, help='model')
-# TODO(tairui): change this back later
-# parser.add_argument('--autoscale-bsz', dest='autoscale_bsz', default=False, action='store_true', help='autoscale batchsize')
 parser.add_argument('--autoscale-bsz', dest='autoscale_bsz', default=True, action='store_true', help='autoscale batchsize')
 args = parser.parse_args()
@@ -69,30 +67,26 @@
     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
 ])
 
-trainset = torchvision.datasets.CIFAR10(root=adaptdl.env.share_path(), train=True, download=True, transform=transform_train)
-trainloader = et.AdaptiveDataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=2, drop_last=True)
+adaptdl.torch.init_process_group("nccl" if torch.cuda.is_available() else "gloo")
+
+if adaptdl.env.replica_rank() == 0:
+    trainset = torchvision.datasets.CIFAR10(root=adaptdl.env.share_path(), train=True, download=True,
+                                            transform=transform_train)
+    trainloader = adl.AdaptiveDataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=2, drop_last=True)
+    dist.barrier()  # We use a barrier here so that non-master replicas wait for the master to finish downloading the data
+else:
+    dist.barrier()
+    trainset = torchvision.datasets.CIFAR10(root=adaptdl.env.share_path(), train=True, download=False, transform=transform_train)
+    trainloader = adl.AdaptiveDataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=2, drop_last=True)
 
 if args.autoscale_bsz:
-    trainloader.autoscale_batch_size(4096, local_bsz_bounds=(32, 1024))
+    trainloader.autoscale_batch_size(4096, local_bsz_bounds=(32, 1024), gradient_accumulation=True)
 
 validset = torchvision.datasets.CIFAR10(root=adaptdl.env.share_path(), train=False, download=False, transform=transform_test)
-validloader = et.AdaptiveDataLoader(validset, batch_size=100, shuffle=False, num_workers=2)
+validloader = adl.AdaptiveDataLoader(validset, batch_size=100, shuffle=False, num_workers=2)
 
 # Model
 print('==> Building model..')
 net = eval(args.model)()
-# net = VGG('VGG19')
-# net = ResNet18()
-# net = PreActResNet18()
-# net = GoogLeNet()
-# net = DenseNet121()
-# net = ResNeXt29_2x64d()
-# net = MobileNet()
-# net = MobileNetV2()
-# net = DPN92()
-# net = ShuffleNetG2()
-# net = SENet18()
-# net = ShuffleNetV2(1)
 net = net.to(device)
 if device == 'cuda':
     cudnn.benchmark = True
@@ -102,15 +96,13 @@
                       lr=args.lr, momentum=0.9, weight_decay=5e-4)
 lr_scheduler = MultiStepLR(optimizer, [30, 45], 0.1)
-adaptdl.torch.init_process_group("nccl" if torch.cuda.is_available()
-                                 else "gloo")
-net = et.AdaptiveDataParallel(net, optimizer, lr_scheduler)
+net = adl.AdaptiveDataParallel(net, optimizer, lr_scheduler)
 
 # Training
 def train(epoch):
     print('\nEpoch: %d' % epoch)
     net.train()
-    stats = et.Accumulator()
+    stats = adl.Accumulator()
     for inputs, targets in trainloader:
         inputs, targets = inputs.to(device), targets.to(device)
         optimizer.zero_grad()
@@ -124,21 +116,18 @@ def train(epoch):
         stats["total"] += targets.size(0)
         stats["correct"] += predicted.eq(targets).sum().item()
 
-    writer.add_scalar("Throughput/Gain", net.gain, epoch)
-    writer.add_scalar("Throughput/Global_Batchsize",
-                      trainloader.current_batch_size, epoch)
-
+    trainloader.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Data/")
+    net.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Model/")
     with stats.synchronized():
         stats["loss_avg"] = stats["loss_sum"] / stats["total"]
         stats["accuracy"] = stats["correct"] / stats["total"]
         writer.add_scalar("Loss/Train", stats["loss_avg"], epoch)
         writer.add_scalar("Accuracy/Train", stats["accuracy"], epoch)
-        writer.add_scalar("Config/Gpu", int(adaptdl.env.num_replicas()), epoch)
     print("Train:", stats)
 
 
 def valid(epoch):
     net.eval()
-    stats = et.Accumulator()
+    stats = adl.Accumulator()
     with torch.no_grad():
         for inputs, targets in validloader:
             inputs, targets = inputs.to(device), targets.to(device)
@@ -155,22 +144,28 @@
         stats["accuracy"] = stats["correct"] / stats["total"]
         writer.add_scalar("Loss/Valid", stats["loss_avg"], epoch)
         writer.add_scalar("Accuracy/Valid", stats["accuracy"], epoch)
-        if IS_CHIEF:
-            nni.report_intermediate_result(stats["loss_avg"], accum=stats)
+
+        if adaptdl.env.replica_rank() == 0:
+            nni.report_intermediate_result(stats["accuracy"], accum=stats)
+
     print("Valid:", stats)
-    return stats["loss_avg"]
+    return stats["accuracy"]
 
 
+# TODO BE-12547: Rename ADAPTDLCTL_TENSORBOARD_LOGDIR to ADAPTDL_TENSORBOARD_LOGDIR
 tensorboard_dir = os.path.join(
os.getenv("ADAPTDLCTL_TENSORBOARD_LOGDIR", "/adaptdl/tensorboard"), os.getenv("NNI_TRIAL_JOB_ID", "cifar-adaptdl") ) if not os.path.exists(tensorboard_dir): os.makedirs(tensorboard_dir) + with SummaryWriter(tensorboard_dir) as writer: - for epoch in et.remaining_epochs_until(args.epochs): + acc = 0 + for epoch in adl.remaining_epochs_until(args.epochs): train(epoch) - avg_loss = valid(epoch) + acc = valid(epoch) lr_scheduler.step() - if IS_CHIEF: - nni.report_final_result(avg_loss) + + if adaptdl.env.replica_rank() == 0: + nni.report_final_result(acc) diff --git a/examples/trials/cifar-adaptdl/search_space_demo.json b/examples/trials/cifar-adaptdl/search_space.json similarity index 100% rename from examples/trials/cifar-adaptdl/search_space_demo.json rename to examples/trials/cifar-adaptdl/search_space.json