Refactor CIFAR Example Based on the Latest AdaptDL #8

Merged · 5 commits · Nov 20, 2020
examples/trials/cifar-adaptdl/Dockerfile (4 changes: 3 additions & 1 deletion)

@@ -1,4 +1,6 @@
 FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime
 
+COPY requirements.txt /requirements.txt
+RUN pip install -r /requirements.txt
+
 COPY ./ /cifar10/
-RUN pip install -r /cifar10/requirements.txt
examples/trials/cifar-adaptdl/config_adl.yml (4 changes: 2 additions & 2 deletions)

@@ -5,15 +5,15 @@ maxExecDuration: 3h
 maxTrialNum: 10
 nniManagerIp: 10.20.100.199
 trainingServicePlatform: adl
-searchSpacePath: search_space_demo.json
+searchSpacePath: search_space.json
 logCollection: http
 useAnnotation: false
 tuner:
   builtinTunerName: TPE
   classArgs:
     optimize_mode: minimize
 trial:
-  command: python3 /cifar10/main_nni.py
+  command: python3 /cifar10/main.py
   codeDir: .
   gpuNum: 1
   image: registry.petuum.com/dev/nni:cifar-example
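
Note: for orientation only, here is a minimal sketch (not part of this PR) of how the trial command above typically consumes the hyperparameters defined in search_space.json; the parameter names and defaults below are assumptions, since the search space file itself is not shown in this diff.

import nni

# Hypothetical search-space keys for illustration; the real
# search_space.json is not included in this PR.
params = nni.get_next_parameter()   # e.g. {"lr": 0.05, "bs": 256}
lr = params.get("lr", 0.1)
bs = params.get("bs", 128)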
examples/trials/cifar-adaptdl/main.py
@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
-'''Train CIFAR10 with PyTorch.'''
+'''
+Train CIFAR10 with PyTorch and AdaptDL. This example is based on:
+https://github.com/petuum/adaptdl/blob/master/examples/pytorch-cifar/main.py
+'''
 import torch
 import torch.nn as nn
 import torch.optim as optim
 import torch.nn.functional as F
 import torch.backends.cudnn as cudnn
+import torch.distributed as dist
 
 import torchvision
 import torchvision.transforms as transforms
@@ -29,23 +31,19 @@
 from models import *
 
 import adaptdl
-import adaptdl.torch as et
+import adaptdl.torch as adl
 
 from torch.optim.lr_scheduler import MultiStepLR
 from torch.utils.tensorboard import SummaryWriter
 
 import nni
 
-IS_CHIEF = int(os.getenv("ADAPTDL_RANK", "0")) == 0
-
 
 parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
 parser.add_argument('--bs', default=128, type=int, help='batch size')
 parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
 parser.add_argument('--epochs', default=30, type=int, help='number of epochs')
 parser.add_argument('--model', default='ResNet18', type=str, help='model')
-# TODO(tairui): change this back later
-# parser.add_argument('--autoscale-bsz', dest='autoscale_bsz', default=False, action='store_true', help='autoscale batchsize')
 parser.add_argument('--autoscale-bsz', dest='autoscale_bsz', default=True, action='store_true', help='autoscale batchsize')
 args = parser.parse_args()
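
Note on the surviving flag (an observation, not a change made by this PR): with action='store_true' and default=True, passing --autoscale-bsz can never turn autoscaling off, which is likely what the deleted TODO was about. A hedged sketch of a toggleable variant, with a hypothetical opt-out flag:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--autoscale-bsz', dest='autoscale_bsz',
                    default=True, action='store_true')
parser.add_argument('--no-autoscale-bsz', dest='autoscale_bsz',
                    action='store_false')   # hypothetical opt-out flag
args = parser.parse_args([])                # defaults: args.autoscale_bsz == True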

@@ -69,30 +67,26 @@
     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
 ])
 
-trainset = torchvision.datasets.CIFAR10(root=adaptdl.env.share_path(), train=True, download=True, transform=transform_train)
-trainloader = et.AdaptiveDataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=2, drop_last=True)
+adaptdl.torch.init_process_group("nccl" if torch.cuda.is_available() else "gloo")
+
+if adaptdl.env.replica_rank() == 0:
+    trainset = torchvision.datasets.CIFAR10(root=adaptdl.env.share_path(), train=True, download=True, transform=transform_train)
+    trainloader = adl.AdaptiveDataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=2, drop_last=True)
+    dist.barrier()  # We use a barrier here so that non-master replicas wait for the master to download the data
+else:
+    dist.barrier()
+    trainset = torchvision.datasets.CIFAR10(root=adaptdl.env.share_path(), train=True, download=False, transform=transform_train)
+    trainloader = adl.AdaptiveDataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=2, drop_last=True)
 
 if args.autoscale_bsz:
-    trainloader.autoscale_batch_size(4096, local_bsz_bounds=(32, 1024))
+    trainloader.autoscale_batch_size(4096, local_bsz_bounds=(32, 1024), gradient_accumulation=True)
 
 validset = torchvision.datasets.CIFAR10(root=adaptdl.env.share_path(), train=False, download=False, transform=transform_test)
-validloader = et.AdaptiveDataLoader(validset, batch_size=100, shuffle=False, num_workers=2)
+validloader = adl.AdaptiveDataLoader(validset, batch_size=100, shuffle=False, num_workers=2)
 
 # Model
 print('==> Building model..')
 net = eval(args.model)()
-# net = VGG('VGG19')
-# net = ResNet18()
-# net = PreActResNet18()
-# net = GoogLeNet()
-# net = DenseNet121()
-# net = ResNeXt29_2x64d()
-# net = MobileNet()
-# net = MobileNetV2()
-# net = DPN92()
-# net = ShuffleNetG2()
-# net = SENet18()
-# net = ShuffleNetV2(1)
 net = net.to(device)
 if device == 'cuda':
     cudnn.benchmark = True
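
Note on the download barrier above: every non-chief replica blocks until replica 0 has finished downloading CIFAR-10 to the shared path, then reads it with download=False. A minimal sketch of the same pattern, assuming the process group is already initialized and using a hypothetical make_dataset factory:

import torch.distributed as dist

def download_on_rank0(make_dataset, rank):
    # make_dataset(download) -> Dataset; hypothetical helper for illustration
    if rank == 0:
        dataset = make_dataset(download=True)
        dist.barrier()   # release the waiting replicas
    else:
        dist.barrier()   # block until rank 0 has finished downloading
        dataset = make_dataset(download=False)
    return dataset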
@@ -102,15 +96,13 @@
                       lr=args.lr, momentum=0.9, weight_decay=5e-4)
 lr_scheduler = MultiStepLR(optimizer, [30, 45], 0.1)
 
-adaptdl.torch.init_process_group("nccl" if torch.cuda.is_available()
-                                 else "gloo")
-net = et.AdaptiveDataParallel(net, optimizer, lr_scheduler)
+net = adl.AdaptiveDataParallel(net, optimizer, lr_scheduler)
 
 # Training
 def train(epoch):
     print('\nEpoch: %d' % epoch)
     net.train()
-    stats = et.Accumulator()
+    stats = adl.Accumulator()
     for inputs, targets in trainloader:
         inputs, targets = inputs.to(device), targets.to(device)
         optimizer.zero_grad()
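
Note: this hunk also removes the old init_process_group call; the PR moves it up before the data-loading section, since dist.barrier() and the AdaptiveDataLoader need the process group to exist first. The ordering below is my reading of the diff, not text from the PR:

import torch
import adaptdl.torch as adl

# 1. Initialize collectives first...
adl.init_process_group("nccl" if torch.cuda.is_available() else "gloo")
# 2. ...only then create AdaptiveDataLoader / AdaptiveDataParallel objects
#    and synchronize replicas with dist.barrier().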
@@ -124,21 +116,18 @@ def train(epoch):
         stats["total"] += targets.size(0)
         stats["correct"] += predicted.eq(targets).sum().item()
 
-    writer.add_scalar("Throughput/Gain", net.gain, epoch)
-    writer.add_scalar("Throughput/Global_Batchsize",
-                      trainloader.current_batch_size, epoch)
-
+    trainloader.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Data/")
+    net.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Model/")
     with stats.synchronized():
         stats["loss_avg"] = stats["loss_sum"] / stats["total"]
         stats["accuracy"] = stats["correct"] / stats["total"]
         writer.add_scalar("Loss/Train", stats["loss_avg"], epoch)
         writer.add_scalar("Accuracy/Train", stats["accuracy"], epoch)
-        writer.add_scalar("Config/Gpu", int(adaptdl.env.num_replicas()), epoch)
     print("Train:", stats)
 
 def valid(epoch):
     net.eval()
-    stats = et.Accumulator()
+    stats = adl.Accumulator()
     with torch.no_grad():
         for inputs, targets in validloader:
             inputs, targets = inputs.to(device), targets.to(device)
@@ -155,22 +144,28 @@ def valid(epoch):
         stats["accuracy"] = stats["correct"] / stats["total"]
         writer.add_scalar("Loss/Valid", stats["loss_avg"], epoch)
         writer.add_scalar("Accuracy/Valid", stats["accuracy"], epoch)
-    if IS_CHIEF:
-        nni.report_intermediate_result(stats["loss_avg"], accum=stats)
+
+    if adaptdl.env.replica_rank() == 0:
+        nni.report_intermediate_result(stats["accuracy"], accum=stats)
+
     print("Valid:", stats)
-    return stats["loss_avg"]
+    return stats["accuracy"]
 
 
+# TODO BE-12547: Rename ADAPTDLCTL_TENSORBOARD_LOGDIR to ADAPTDL_TENSORBOARD_LOGDIR
 tensorboard_dir = os.path.join(
     os.getenv("ADAPTDLCTL_TENSORBOARD_LOGDIR", "/adaptdl/tensorboard"),
     os.getenv("NNI_TRIAL_JOB_ID", "cifar-adaptdl")
 )
 if not os.path.exists(tensorboard_dir):
     os.makedirs(tensorboard_dir)
 
 with SummaryWriter(tensorboard_dir) as writer:
-    for epoch in et.remaining_epochs_until(args.epochs):
+    acc = 0
+    for epoch in adl.remaining_epochs_until(args.epochs):
         train(epoch)
-        avg_loss = valid(epoch)
+        acc = valid(epoch)
         lr_scheduler.step()
-    if IS_CHIEF:
-        nni.report_final_result(avg_loss)
+
+if adaptdl.env.replica_rank() == 0:
+    nni.report_final_result(acc)
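
Note: the loop above switches from range-style epochs to adl.remaining_epochs_until, so that after AdaptDL rescales the job and restarts the process, training resumes from the last completed epoch instead of epoch 0. The reporting side can be summarized with a hedged sketch (my summary, not text from the PR): each trial emits one intermediate metric per epoch and exactly one final metric, and only replica 0 talks to NNI so the manager never sees duplicate reports from the other replicas.

import nni
import adaptdl.env

def report(metric, is_final=False):
    # Only the chief replica reports, mirroring the replica_rank() == 0
    # guards in the diff above.
    if adaptdl.env.replica_rank() != 0:
        return
    if is_final:
        nni.report_final_result(metric)        # once, after the epoch loop
    else:
        nni.report_intermediate_result(metric) # once per epoch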