diff --git a/configs/seg/cityscapes/fs_deeplabv3_cityscapes_seg.conf b/configs/seg/cityscapes/fs_deeplabv3_cityscapes_seg.conf
index 404f0cd..9bd7b65 100755
--- a/configs/seg/cityscapes/fs_deeplabv3_cityscapes_seg.conf
+++ b/configs/seg/cityscapes/fs_deeplabv3_cityscapes_seg.conf
@@ -75,7 +75,7 @@
       "ms_test":{
         "scale_search": [0.75, 1.0, 1.25]
       },
-      "mode": "mscrop_test"
+      "mode": "ss_test"
     },
     "details": {
       "color_list": [[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], [190, 153, 153],
diff --git a/datasets/seg/data_loader.py b/datasets/seg/data_loader.py
index 4621263..e031162 100755
--- a/datasets/seg/data_loader.py
+++ b/datasets/seg/data_loader.py
@@ -4,6 +4,7 @@
 # Class for the Semantic Segmentation Data Loader.
 
 
+import torch
 from torch.utils import data
 
 from datasets.seg.loader.default_loader import DefaultLoader
@@ -45,13 +46,18 @@ def __init__(self, configer):
 
     def get_trainloader(self):
         if self.configer.get('train.loader', default=None) in [None, 'default']:
-            trainloader = data.DataLoader(
-                DefaultLoader(root_dir=self.configer.get('data', 'data_dir'), dataset='train',
+            dataset = DefaultLoader(root_dir=self.configer.get('data', 'data_dir'), dataset='train',
                               aug_transform=self.aug_train_transform,
                               img_transform=self.img_transform,
                               label_transform=self.label_transform,
-                              configer=self.configer),
-                batch_size=self.configer.get('train', 'batch_size'), shuffle=True,
+                              configer=self.configer)
+            sampler = None
+            if self.configer.get('network.distributed'):
+                sampler = torch.utils.data.distributed.DistributedSampler(dataset)
+
+            trainloader = data.DataLoader(
+                dataset, sampler=sampler,
+                batch_size=self.configer.get('train', 'batch_size'), shuffle=(sampler is None),
                 num_workers=self.configer.get('data', 'workers'), pin_memory=True,
                 drop_last=self.configer.get('data', 'drop_last'),
                 collate_fn=lambda *args: collate(
@@ -65,15 +71,19 @@ def get_trainloader(self):
             Log.error('{} train loader is invalid.'.format(self.configer.get('train', 'loader')))
             exit(1)
 
-    def get_valloader(self, dataset=None):
-        dataset = 'val' if dataset is None else dataset
+    def get_valloader(self):
         if self.configer.get('val.loader', default=None) in [None, 'default']:
+            dataset = DefaultLoader(root_dir=self.configer.get('data', 'data_dir'), dataset='val',
+                                    aug_transform=self.aug_val_transform,
+                                    img_transform=self.img_transform,
+                                    label_transform=self.label_transform,
+                                    configer=self.configer)
+            sampler = None
+            if self.configer.get('network.distributed'):
+                sampler = torch.utils.data.distributed.DistributedSampler(dataset)
+
             valloader = data.DataLoader(
-                DefaultLoader(root_dir=self.configer.get('data', 'data_dir'), dataset=dataset,
-                              aug_transform=self.aug_val_transform,
-                              img_transform=self.img_transform,
-                              label_transform=self.label_transform,
-                              configer=self.configer),
+                dataset, sampler=sampler,
                 batch_size=self.configer.get('val', 'batch_size'), shuffle=False,
                 num_workers=self.configer.get('data', 'workers'), pin_memory=True,
                 collate_fn=lambda *args: collate(
diff --git a/datasets/test/test_data_loader.py b/datasets/test/test_data_loader.py
index d9ad400..76b53c1 100644
--- a/datasets/test/test_data_loader.py
+++ b/datasets/test/test_data_loader.py
@@ -34,7 +34,7 @@ def __init__(self, configer):
             Normalize(**self.configer.get('data', 'normalize')), ])
 
     def get_testloader(self, test_dir=None, list_path=None, json_path=None):
-        if not self.configer.exists('test', 'loader') or self.configer.get('test', 'loader') == 'default':
+        if self.configer.get('test.loader', default=None) in [None, 'default']:
             test_dir = test_dir if test_dir is not None else self.configer.get('test', 'test_dir')
             testloader = data.DataLoader(
                 DefaultLoader(test_dir=test_dir,
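Note on the sampler change above: the pattern (build the dataset once, attach a DistributedSampler when network.distributed is set, pass shuffle=(sampler is None)) is the standard way to shard one dataset across DDP processes. A minimal runnable sketch of what the sampler does, with a toy TensorDataset standing in for DefaultLoader and two ranks faked explicitly so no process group is needed:

    import torch
    from torch.utils.data import DataLoader, TensorDataset
    from torch.utils.data.distributed import DistributedSampler

    dataset = TensorDataset(torch.arange(8))  # stand-in for DefaultLoader

    for rank in range(2):
        # In real training each process builds the sampler without num_replicas/rank;
        # they default to the process group's world size and this process's rank.
        sampler = DistributedSampler(dataset, num_replicas=2, rank=rank, shuffle=True)
        loader = DataLoader(dataset, batch_size=2, sampler=sampler,
                            shuffle=False)  # shuffle must stay False once a sampler is set
        print(rank, [batch[0].tolist() for batch in loader])  # disjoint halves of the data

DataLoader refuses shuffle=True together with an explicit sampler, which is why the patch switches to shuffle=(sampler is None).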
diff --git a/main.py b/main.py
index 8eafe24..52a9477 100755
--- a/main.py
+++ b/main.py
@@ -67,6 +67,8 @@ def str2bool(v):
                         dest='network.backbone', help='The base network of model.')
     parser.add_argument('--norm_type', default=None, type=str,
                         dest='network.norm_type', help='The BN type of the network.')
+    parser.add_argument('--syncbn', type=str2bool, nargs='?', default=False,
+                        dest='network.syncbn', help='Whether to sync BN.')
     parser.add_argument('--pretrained', type=str, default=None,
                         dest='network.pretrained', help='The path to pretrained model.')
     parser.add_argument('--resume', default=None, type=str,
@@ -79,6 +81,8 @@ def str2bool(v):
                         dest='network.resume_val', help='Whether to validate during resume.')
     parser.add_argument('--gather', type=str2bool, nargs='?', default=True,
                         dest='network.gather', help='Whether to gather the output of model.')
+    parser.add_argument('--distributed', type=str2bool, nargs='?', default=False,
+                        dest='network.distributed', help='Whether to use distributed training.')
 
     # *********** Params for solver. **********
     parser.add_argument('--optim_method', default=None, type=str,
@@ -127,27 +131,28 @@ def str2bool(v):
     # *********** Params for env. **********
     parser.add_argument('--seed', default=None, type=int, help='manual seed')
     parser.add_argument('--cudnn', type=str2bool, nargs='?', default=True, help='Use CUDNN.')
+    parser.add_argument("--local_rank", default=0, type=int)
 
-    args_parser = parser.parse_args()
+    args = parser.parse_args()
+    configer = Configer(args_parser=args)
 
-    if args_parser.seed is not None:
-        random.seed(args_parser.seed)
-        torch.manual_seed(args_parser.seed)
+    if args.seed is not None:
+        random.seed(args.seed)
+        torch.manual_seed(args.seed)
 
     cudnn.enabled = True
-    cudnn.benchmark = args_parser.cudnn
+    cudnn.benchmark = args.cudnn
 
-    configer = Configer(args_parser=args_parser)
     abs_data_dir = os.path.expanduser(configer.get('data', 'data_dir'))
     configer.update('data.data_dir', abs_data_dir)
 
-    if configer.get('gpu') is not None:
+    if configer.get('gpu') is not None and not configer.get('network.distributed', default=False):
         os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(str(gpu_id) for gpu_id in configer.get('gpu'))
 
     if configer.get('network', 'norm_type') is None:
         configer.update('network.norm_type', 'batchnorm')
 
-    if len(configer.get('gpu')) == 1 or len(range(torch.cuda.device_count())) == 1:
+    if torch.cuda.device_count() <= 1 or configer.get('network.distributed', default=False):
         configer.update('network.gather', True)
 
     if configer.get('phase') == 'train':
@@ -171,6 +176,7 @@ def str2bool(v):
     Log.info('BN Type is {}.'.format(configer.get('network', 'norm_type')))
     Log.info('Config Dict: {}'.format(json.dumps(configer.to_dict(), indent=2)))
 
+
     runner_selector = RunnerSelector(configer)
     runner = None
     if configer.get('task') == 'pose':
@@ -186,7 +192,6 @@ def str2bool(v):
     else:
         Log.error('Task: {} is not valid.'.format(configer.get('task')))
         exit(1)
-
     if configer.get('phase') == 'train':
         if configer.get('network', 'resume') is None:
             Controller.init(runner)
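Note on --local_rank: torch.distributed.launch spawns one Python process per GPU and appends --local_rank=<n> to each child's argv, so main.py must accept the flag even if it never reads it directly. A self-contained sketch, where the list passed to parse_args simulates the launcher's argv:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", default=0, type=int)

    # Simulate the argv that `torch.distributed.launch` appends for GPU 2.
    args = parser.parse_args(["--local_rank=2"])
    print(args.local_rank)  # 2

The CUDA_VISIBLE_DEVICES masking is skipped in distributed mode for the same reason: each worker picks its own device from local_rank instead (see runner_helper.py below).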
diff --git a/model/seg/nets/deeplabv3.py b/model/seg/nets/deeplabv3.py
index dea0a89..7957fbe 100755
--- a/model/seg/nets/deeplabv3.py
+++ b/model/seg/nets/deeplabv3.py
@@ -88,6 +88,9 @@ def forward(self, data_dict):
         x = F.interpolate(x, size=(data_dict['img'].size(2), data_dict['img'].size(3)),
                           mode="bilinear", align_corners=True)
         out_dict = dict(dsn_out=x_dsn, out=x)
+        if self.configer.get('phase') == 'test':
+            return out_dict
+
         loss_dict = dict()
         if 'dsn_ce_loss' in self.valid_loss_dict:
             loss_dict['dsn_ce_loss'] = dict(
diff --git a/model/seg/nets/denseassp.py b/model/seg/nets/denseassp.py
index 0add411..90240e2 100644
--- a/model/seg/nets/denseassp.py
+++ b/model/seg/nets/denseassp.py
@@ -96,6 +96,9 @@ def forward(self, data_dict):
         x = F.interpolate(x, size=(data_dict['img'].size(2), data_dict['img'].size(3)),
                           mode="bilinear", align_corners=True)
         out_dict = dict(out=x)
+        if self.configer.get('phase') == 'test':
+            return out_dict
+
         loss_dict = dict()
         if 'ce_loss' in self.valid_loss_dict:
             loss_dict['ce_loss'] = dict(
diff --git a/model/seg/nets/pspnet.py b/model/seg/nets/pspnet.py
index 94d83e5..5436caf 100644
--- a/model/seg/nets/pspnet.py
+++ b/model/seg/nets/pspnet.py
@@ -92,6 +92,9 @@ def forward(self, data_dict):
         x = F.interpolate(x, size=(data_dict['img'].size(2), data_dict['img'].size(3)),
                           mode="bilinear", align_corners=True)
         out_dict = dict(dsn_out=x_dsn, out=x)
+        if self.configer.get('phase') == 'test':
+            return out_dict
+
         loss_dict = dict()
         if 'dsn_ce_loss' in self.valid_loss_dict:
             loss_dict['dsn_ce_loss'] = dict(
diff --git a/model/tools/module_helper.py b/model/tools/module_helper.py
index 3b88cdc..c94e65c 100644
--- a/model/tools/module_helper.py
+++ b/model/tools/module_helper.py
@@ -24,12 +24,6 @@ def BNReLU(num_features, norm_type=None, **kwargs):
                 nn.BatchNorm2d(num_features, **kwargs),
                 nn.ReLU()
             )
-        elif norm_type == 'sync_batchnorm':
-            from exts.ops.sync_bn.syncbn import BatchNorm2d
-            return nn.Sequential(
-                BatchNorm2d(num_features, **kwargs),
-                nn.ReLU()
-            )
         elif norm_type == 'encsync_batchnorm':
             from encoding.nn import BatchNorm2d
             return nn.Sequential(
@@ -41,9 +35,6 @@ def BNReLU(num_features, norm_type=None, **kwargs):
                 nn.InstanceNorm2d(num_features, **kwargs),
                 nn.ReLU()
             )
-        # elif bn_type == 'inplace_abn':
-        #     from extensions.ops.inplace_abn.bn import InPlaceABNSync
-        #     return InPlaceABNSync(num_features, **kwargs)
         else:
             Log.error('Not support BN type: {}.'.format(norm_type))
             exit(1)
@@ -53,10 +44,6 @@ def BatchNorm3d(norm_type=None, ret_cls=False):
         if norm_type == 'batchnorm':
             return nn.BatchNorm3d
 
-        elif norm_type == 'sync_batchnorm':
-            from exts.ops.sync_bn.syncbn import BatchNorm3d
-            return BatchNorm3d
-
         elif norm_type == 'encsync_batchnorm':
             from encoding.nn import BatchNorm3d
             return BatchNorm3d
@@ -79,10 +66,6 @@ def BatchNorm2d(norm_type=None, ret_cls=False):
         if norm_type == 'batchnorm':
             return nn.BatchNorm2d
 
-        elif norm_type == 'sync_batchnorm':
-            from exts.ops.sync_bn.syncbn import BatchNorm2d
-            return BatchNorm2d
-
         elif norm_type == 'encsync_batchnorm':
             from encoding.nn import BatchNorm2d
             return BatchNorm2d
@@ -105,10 +88,6 @@ def BatchNorm1d(norm_type=None, ret_cls=False):
         if norm_type == 'batchnorm':
             return nn.BatchNorm1d
 
-        elif norm_type == 'sync_batchnorm':
-            from exts.ops.sync_bn.syncbn import BatchNorm1d
-            return BatchNorm1d
-
         elif norm_type == 'encsync_batchnorm':
             from encoding.nn import BatchNorm1d
             return BatchNorm1d
@@ -127,7 +106,7 @@ def BatchNorm1d(norm_type=None, ret_cls=False):
             exit(1)
 
     @staticmethod
-    def load_model(model, pretrained=None, all_match=True):
+    def load_model(model, pretrained=None, all_match=True, map_location='cpu'):
         if pretrained is None:
             return model
 
@@ -137,7 +116,7 @@ def load_model(model, pretrained=None, all_match=True):
 
         Log.info('Loading pretrained model:{}'.format(pretrained))
         if all_match:
-            pretrained_dict = torch.load(pretrained)
+            pretrained_dict = torch.load(pretrained, map_location=map_location)
             model_dict = model.state_dict()
             load_dict = dict()
             for k, v in pretrained_dict.items():
@@ -146,7 +125,6 @@ def load_model(model, pretrained=None, all_match=True):
                 else:
                     load_dict[k] = v
 
-            # load_dict = {k: v for k, v in pretrained_dict.items() if 'resinit.{}'.format(k) not in model_dict}
             model.load_state_dict(load_dict)
 
         else:
@@ -161,7 +139,7 @@ def load_model(model, pretrained=None, all_match=True):
 
     @staticmethod
     def load_url(url, map_location=None):
-        model_dir = os.path.join('~', '.PyTorchCV', 'model')
+        model_dir = os.path.join('~', '.TorchCV', 'model')
         if not os.path.exists(model_dir):
             os.makedirs(model_dir)
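Note on the new map_location='cpu' default in load_model: torch.load otherwise restores tensors to the device they were saved from, so every distributed worker would deserialize onto the GPU that produced the checkpoint. A minimal sketch of the semantics (the /tmp path is illustrative only):

    import torch
    import torch.nn as nn

    net = nn.Linear(4, 2)
    torch.save(net.state_dict(), '/tmp/demo_checkpoint.pth')

    # map_location='cpu' remaps any CUDA-saved storage onto the CPU at load
    # time, so no GPU memory is touched during deserialization.
    state = torch.load('/tmp/demo_checkpoint.pth', map_location='cpu')
    net.load_state_dict(state)
    print(next(net.parameters()).device)  # cpu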
diff --git a/runner/seg/fcn_segmentor.py b/runner/seg/fcn_segmentor.py
index c7de6fa..da831cf 100755
--- a/runner/seg/fcn_segmentor.py
+++ b/runner/seg/fcn_segmentor.py
@@ -82,6 +82,7 @@ def train(self):
             self.data_time.update(time.time() - start_time)
 
             # Forward pass.
+            data_dict = RunnerHelper.to_device(self, data_dict)
             out = self.seg_net(data_dict)
             # Compute the loss of the train batch & backward.
             loss_dict = self.loss(out)
@@ -132,7 +133,7 @@ def val(self, data_loader=None):
         data_loader = self.val_loader if data_loader is None else data_loader
         for j, data_dict in enumerate(data_loader):
-
+            data_dict = RunnerHelper.to_device(self, data_dict)
             with torch.no_grad():
                 # Forward pass.
                 out = self.seg_net(data_dict)
diff --git a/runner/seg/fcn_segmentor_test.py b/runner/seg/fcn_segmentor_test.py
index 4aaf64e..a1bcc71 100755
--- a/runner/seg/fcn_segmentor_test.py
+++ b/runner/seg/fcn_segmentor_test.py
@@ -69,10 +69,10 @@ def test(self, test_dir, out_dir):
                 ImageHelper.save(image_canvas,
                                  save_path=os.path.join(out_dir, 'vis/{}.png'.format(meta_list[i]['filename'])))
 
-            if self.configer.exists('data', 'label_list'):
+            if self.configer.get('data.label_list', default=None) is not None:
                 label_img = self.__relabel(label_img)
 
-            if self.configer.exists('data', 'reduce_zero_label') and self.configer.get('data', 'reduce_zero_label'):
+            if self.configer.get('data.reduce_zero_label', default=False):
                 label_img = label_img + 1
                 label_img = label_img.astype(np.uint8)
diff --git a/runner/tools/blob_helper.py b/runner/tools/blob_helper.py
index f5c693d..09db231 100644
--- a/runner/tools/blob_helper.py
+++ b/runner/tools/blob_helper.py
@@ -30,7 +30,7 @@ def get_blob(self, data_dict, scale=None, flip=False):
         if flip:
             image = image.flip([2])
 
-        if self.configer.exists('test', 'fit_stride'):
+        if self.configer.get('test.fit_stride', default=0) > 0:
             stride = self.configer.get('test', 'fit_stride')
             pad_w = 0 if (border_hw[1] % stride == 0) else stride - (border_hw[1] % stride)  # right
diff --git a/runner/tools/controller.py b/runner/tools/controller.py
index dc44a3d..d3ff342 100644
--- a/runner/tools/controller.py
+++ b/runner/tools/controller.py
@@ -31,12 +31,18 @@ def train(runner):
         if runner.configer.get('solver', 'lr')['metric'] == 'epoch':
             while runner.runner_state['epoch'] < runner.configer.get('solver', 'max_epoch'):
+                if runner.configer.get('network.distributed'):
+                    runner.train_loader.sampler.set_epoch(runner.runner_state['epoch'])
+
                 runner.train()
                 if runner.runner_state['epoch'] == runner.configer.get('solver', 'max_epoch'):
                     runner.val()
                     break
         else:
             while runner.runner_state['iters'] < runner.configer.get('solver', 'max_iters'):
+                if runner.configer.get('network.distributed'):
+                    runner.train_loader.sampler.set_epoch(runner.runner_state['epoch'])
+
                 runner.train()
                 if runner.runner_state['iters'] == runner.configer.get('solver', 'max_iters'):
                     runner.val()
                     break
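Note on set_epoch: DistributedSampler derives its shuffling permutation from the current epoch, so without this call every epoch replays the same order on every rank. A runnable sketch, with num_replicas and rank passed explicitly so no process group is required:

    import torch
    from torch.utils.data import TensorDataset
    from torch.utils.data.distributed import DistributedSampler

    dataset = TensorDataset(torch.arange(8))
    sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)

    # The epoch seeds the shuffle; changing it yields a new permutation.
    sampler.set_epoch(0)
    print(list(sampler))  # this rank's indices for epoch 0
    sampler.set_epoch(1)
    print(list(sampler))  # a different permutation for epoch 1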
diff --git a/runner/tools/runner_helper.py b/runner/tools/runner_helper.py
index 03da311..fb1efc0 100755
--- a/runner/tools/runner_helper.py
+++ b/runner/tools/runner_helper.py
@@ -29,22 +29,27 @@ def to_device(runner, in_data):
 
     @staticmethod
     def _make_parallel(runner, net):
-        parallel_type = runner.configer.get('network.parallel', default='dp')
-        if parallel_type == 'dp':
+        if runner.configer.get('network.distributed', default=False):
+            from apex.parallel import DistributedDataParallel
+            torch.cuda.set_device(runner.configer.get('local_rank'))
+            torch.distributed.init_process_group(backend='nccl', init_method='env://')
+            net = DistributedDataParallel(net.cuda(), delay_allreduce=True)
+            return net
+
+        else:
+            net = net.to(torch.device('cpu' if runner.configer.get('gpu') is None else 'cuda'))
             from exts.tools.parallel.data_parallel import ParallelModel
             return ParallelModel(net, gather_=runner.configer.get('network', 'gather'))
 
-        elif parallel_type == 'ddp':
-            from exts.tools.parallel.data_parallel import DistributeParallelModel
-            return DistributeParallelModel(net, gather_=runner.configer.get('network', 'gather'))
-
-        else:
-            raise ValueError('Not support DataParallel: {}'.format(parallel_type))
 
     @staticmethod
     def load_net(runner, net, model_path=None):
+        if runner.configer.get('network.syncbn', default=False):
+            Log.info('Converting syncbn model...')
+            from apex.parallel import convert_syncbn_model
+            net = convert_syncbn_model(net)
+
         if runner.configer.get('gpu') is not None:
             net = RunnerHelper._make_parallel(runner, net)
-            net = net.to(torch.device('cpu' if runner.configer.get('gpu') is None else 'cuda'))
 
         if model_path is not None or runner.configer.get('network', 'resume') is not None:
             resume_path = runner.configer.get('network', 'resume')
             resume_path = model_path if model_path is not None else resume_path
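Note on the syncbn conversion: convert_syncbn_model walks the module tree and replaces every BatchNorm layer with apex's synchronized version, so batch statistics are computed across all processes rather than per GPU, which matters here because the training script drops the per-GPU batch size to 1. If apex is unavailable, PyTorch 1.1+ ships a native equivalent; a sketch of that alternative (not what the patch itself uses):

    import torch.nn as nn

    net = nn.Sequential(nn.Conv2d(3, 16, 3), nn.BatchNorm2d(16), nn.ReLU())

    # Native counterpart of apex.parallel.convert_syncbn_model: recursively
    # swaps BatchNorm*d modules for SyncBatchNorm. Conversion itself needs no
    # process group; running a forward pass does.
    net = nn.SyncBatchNorm.convert_sync_batchnorm(net)
    print(type(net[1]).__name__)  # SyncBatchNorm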
diff --git a/scripts/seg/cityscapes/run_fs_deeplabv3_cityscapes_seg.sh b/scripts/seg/cityscapes/run_fs_deeplabv3_cityscapes_seg.sh
index 9da7b49..707be3c 100644
--- a/scripts/seg/cityscapes/run_fs_deeplabv3_cityscapes_seg.sh
+++ b/scripts/seg/cityscapes/run_fs_deeplabv3_cityscapes_seg.sh
@@ -27,16 +27,17 @@ if [[ ! -d ${LOG_DIR} ]]; then
     mkdir -p ${LOG_DIR}
 fi
 
+NGPUS=4
 if [[ "$1"x == "train"x ]]; then
-  ${PYTHON} -u main.py --config_file ${CONFIG_FILE} --drop_last y --phase train --gather n \
-                       --backbone ${BACKBONE} --model_name ${MODEL_NAME} --gpu 0 1 2 3 \
+  ${PYTHON} -m torch.distributed.launch --nproc_per_node=${NGPUS} main.py --config_file ${CONFIG_FILE} --phase train --train_batch_size 1 --val_batch_size 1 \
+                       --backbone ${BACKBONE} --model_name ${MODEL_NAME} --drop_last y --syncbn y \
                        --data_dir ${DATA_DIR} --loss_type ${LOSS_TYPE} --max_iters ${MAX_ITERS} \
                        --checkpoints_name ${CHECKPOINTS_NAME} --pretrained ${PRETRAINED_MODEL} 2>&1 | tee ${LOG_FILE}
 
 elif [[ "$1"x == "resume"x ]]; then
   ${PYTHON} -u main.py --config_file ${CONFIG_FILE} --drop_last y --phase train --gather n \
-                       --backbone ${BACKBONE} --model_name ${MODEL_NAME} --gpu 0 1 2 3 \
+                       --backbone ${BACKBONE} --model_name ${MODEL_NAME} --gpu None \
                        --data_dir ${DATA_DIR} --loss_type ${LOSS_TYPE} --max_iters ${MAX_ITERS} \
                        --resume_continue y --resume ./checkpoints/seg/cityscapes/${CHECKPOINTS_NAME}_latest.pth \
                        --checkpoints_name ${CHECKPOINTS_NAME} --pretrained ${PRETRAINED_MODEL} 2>&1 | tee -a ${LOG_FILE}
@@ -46,7 +47,7 @@ elif [[ "$1"x == "val"x ]]; then
                        --backbone ${BACKBONE} --model_name ${MODEL_NAME} --checkpoints_name ${CHECKPOINTS_NAME} \
                        --resume ./checkpoints/seg/cityscapes/${CHECKPOINTS_NAME}_latest.pth \
                        --test_dir ${DATA_DIR}/val/image --out_dir val 2>&1 | tee -a ${LOG_FILE}
-  cd metrics/seg/
+  cd metric/seg/
   ${PYTHON} -u cityscapes_evaluator.py --pred_dir ../../results/seg/cityscapes/${CHECKPOINTS_NAME}/val/label \
                                        --gt_dir ${DATA_DIR}/val/label 2>&1 | tee -a "../../"${LOG_FILE}
diff --git a/scripts/seg/cityscapes/run_fs_pspnet_cityscapes_seg.sh b/scripts/seg/cityscapes/run_fs_pspnet_cityscapes_seg.sh
index e50f5b2..a423394 100644
--- a/scripts/seg/cityscapes/run_fs_pspnet_cityscapes_seg.sh
+++ b/scripts/seg/cityscapes/run_fs_pspnet_cityscapes_seg.sh
@@ -29,7 +29,7 @@ fi
 
 if [[ "$1"x == "train"x ]]; then
-  ${PYTHON} -u main.py --config_file ${CONFIG_FILE} --drop_last y --phase train --gather n \
+  ${PYTHON} -u main.py --config_file ${CONFIG_FILE} --drop_last y --phase train --gather n --workers 16 \
                        --backbone ${BACKBONE} --model_name ${MODEL_NAME} --gpu 0 1 2 3 \
                        --data_dir ${DATA_DIR} --loss_type ${LOSS_TYPE} --max_iters ${MAX_ITERS} \
                        --checkpoints_name ${CHECKPOINTS_NAME} --pretrained ${PRETRAINED_MODEL} 2>&1 | tee ${LOG_FILE}
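Note on init_method='env://' (used in runner_helper.py above): it reads the rendezvous variables that torch.distributed.launch exports into every child process, namely MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE. A single-process, CPU-only stand-in on the gloo backend, so the sketch runs without GPUs or the launcher:

    import os
    import torch.distributed as dist

    # Set by torch.distributed.launch in real runs; provided by hand here.
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    os.environ.setdefault('RANK', '0')
    os.environ.setdefault('WORLD_SIZE', '1')

    dist.init_process_group(backend='gloo', init_method='env://')
    print(dist.get_rank(), dist.get_world_size())  # 0 1
    dist.destroy_process_group()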