Skip to content

Commit

Permalink
fix seg testing.
Browse files Browse the repository at this point in the history
  • Loading branch information
donnyyou committed Jul 29, 2019
1 parent 7dfb954 commit 3967fb6
Show file tree
Hide file tree
Showing 15 changed files with 79 additions and 64 deletions.
2 changes: 1 addition & 1 deletion configs/seg/cityscapes/fs_deeplabv3_cityscapes_seg.conf
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
"ms_test":{
"scale_search": [0.75, 1.0, 1.25]
},
"mode": "mscrop_test"
"mode": "ss_test"
},
"details": {
"color_list": [[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], [190, 153, 153],
Expand Down
32 changes: 21 additions & 11 deletions datasets/seg/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# Class for the Semantic Segmentation Data Loader.


import torch
from torch.utils import data

from datasets.seg.loader.default_loader import DefaultLoader
Expand Down Expand Up @@ -45,13 +46,18 @@ def __init__(self, configer):

def get_trainloader(self):
if self.configer.get('train.loader', default=None) in [None, 'default']:
trainloader = data.DataLoader(
DefaultLoader(root_dir=self.configer.get('data', 'data_dir'), dataset='train',
dataset = DefaultLoader(root_dir=self.configer.get('data', 'data_dir'), dataset='train',
aug_transform=self.aug_train_transform,
img_transform=self.img_transform,
label_transform=self.label_transform,
configer=self.configer),
batch_size=self.configer.get('train', 'batch_size'), shuffle=True,
configer=self.configer)
sampler = None
if self.configer.get('network.distributed'):
sampler = torch.utils.data.distributed.DistributedSampler(dataset)

trainloader = data.DataLoader(
dataset, sampler=sampler,
batch_size=self.configer.get('train', 'batch_size'), shuffle=(sampler is None),
num_workers=self.configer.get('data', 'workers'), pin_memory=True,
drop_last=self.configer.get('data', 'drop_last'),
collate_fn=lambda *args: collate(
Expand All @@ -65,15 +71,19 @@ def get_trainloader(self):
Log.error('{} train loader is invalid.'.format(self.configer.get('train', 'loader')))
exit(1)

def get_valloader(self, dataset=None):
dataset = 'val' if dataset is None else dataset
def get_valloader(self):
if self.configer.get('val.loader', default=None) in [None, 'default']:
dataset = DefaultLoader(root_dir=self.configer.get('data', 'data_dir'), dataset='val',
aug_transform=self.aug_val_transform,
img_transform=self.img_transform,
label_transform=self.label_transform,
configer=self.configer)
sampler = None
if self.configer.get('network.distributed'):
sampler = torch.utils.data.distributed.DistributedSampler(dataset)

valloader = data.DataLoader(
DefaultLoader(root_dir=self.configer.get('data', 'data_dir'), dataset=dataset,
aug_transform=self.aug_val_transform,
img_transform=self.img_transform,
label_transform=self.label_transform,
configer=self.configer),
dataset, sampler=sampler,
batch_size=self.configer.get('val', 'batch_size'), shuffle=False,
num_workers=self.configer.get('data', 'workers'), pin_memory=True,
collate_fn=lambda *args: collate(
Expand Down
2 changes: 1 addition & 1 deletion datasets/test/test_data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __init__(self, configer):
Normalize(**self.configer.get('data', 'normalize')), ])

def get_testloader(self, test_dir=None, list_path=None, json_path=None):
if not self.configer.exists('test', 'loader') or self.configer.get('test', 'loader') == 'default':
if self.configer.get('test.loader', default=None) in [None, 'default']:
test_dir = test_dir if test_dir is not None else self.configer.get('test', 'test_dir')
testloader = data.DataLoader(
DefaultLoader(test_dir=test_dir,
Expand Down
23 changes: 14 additions & 9 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ def str2bool(v):
dest='network.backbone', help='The base network of model.')
parser.add_argument('--norm_type', default=None, type=str,
dest='network.norm_type', help='The BN type of the network.')
parser.add_argument('--syncbn', type=str2bool, nargs='?', default=False,
dest='network.syncbn', help='Whether to sync BN.')
parser.add_argument('--pretrained', type=str, default=None,
dest='network.pretrained', help='The path to pretrained model.')
parser.add_argument('--resume', default=None, type=str,
Expand All @@ -79,6 +81,8 @@ def str2bool(v):
dest='network.resume_val', help='Whether to validate during resume.')
parser.add_argument('--gather', type=str2bool, nargs='?', default=True,
dest='network.gather', help='Whether to gather the output of model.')
parser.add_argument('--distributed', type=str2bool, nargs='?', default=False,
                        dest='network.distributed', help='Whether to use distributed training.')

# *********** Params for solver. **********
parser.add_argument('--optim_method', default=None, type=str,
Expand Down Expand Up @@ -127,27 +131,28 @@ def str2bool(v):
# *********** Params for env. **********
parser.add_argument('--seed', default=None, type=int, help='manual seed')
parser.add_argument('--cudnn', type=str2bool, nargs='?', default=True, help='Use CUDNN.')
parser.add_argument("--local_rank", default=0, type=int)

args_parser = parser.parse_args()
args = parser.parse_args()
configer = Configer(args_parser=args)

if args_parser.seed is not None:
random.seed(args_parser.seed)
torch.manual_seed(args_parser.seed)
if args.seed is not None:
random.seed(args.seed)
torch.manual_seed(args.seed)

cudnn.enabled = True
cudnn.benchmark = args_parser.cudnn
cudnn.benchmark = args.cudnn

configer = Configer(args_parser=args_parser)
abs_data_dir = os.path.expanduser(configer.get('data', 'data_dir'))
configer.update('data.data_dir', abs_data_dir)

if configer.get('gpu') is not None:
if configer.get('gpu') is not None and not configer.get('network.distributed', default=False):
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(str(gpu_id) for gpu_id in configer.get('gpu'))

if configer.get('network', 'norm_type') is None:
configer.update('network.norm_type', 'batchnorm')

if len(configer.get('gpu')) == 1 or len(range(torch.cuda.device_count())) == 1:
if torch.cuda.device_count() <= 1 or configer.get('network.distributed', default=False):
configer.update('network.gather', True)

if configer.get('phase') == 'train':
Expand All @@ -171,6 +176,7 @@ def str2bool(v):

Log.info('BN Type is {}.'.format(configer.get('network', 'norm_type')))
Log.info('Config Dict: {}'.format(json.dumps(configer.to_dict(), indent=2)))

runner_selector = RunnerSelector(configer)
runner = None
if configer.get('task') == 'pose':
Expand All @@ -186,7 +192,6 @@ def str2bool(v):
else:
Log.error('Task: {} is not valid.'.format(configer.get('task')))
exit(1)

if configer.get('phase') == 'train':
if configer.get('network', 'resume') is None:
Controller.init(runner)
Expand Down
3 changes: 3 additions & 0 deletions model/seg/nets/deeplabv3.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ def forward(self, data_dict):
x = F.interpolate(x, size=(data_dict['img'].size(2), data_dict['img'].size(3)),
mode="bilinear", align_corners=True)
out_dict = dict(dsn_out=x_dsn, out=x)
if self.configer.get('phase') == 'test':
return out_dict

loss_dict = dict()
if 'dsn_ce_loss' in self.valid_loss_dict:
loss_dict['dsn_ce_loss'] = dict(
Expand Down
3 changes: 3 additions & 0 deletions model/seg/nets/denseassp.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ def forward(self, data_dict):
x = F.interpolate(x, size=(data_dict['img'].size(2), data_dict['img'].size(3)),
mode="bilinear", align_corners=True)
out_dict = dict(out=x)
if self.configer.get('phase') == 'test':
return out_dict

loss_dict = dict()
if 'ce_loss' in self.valid_loss_dict:
loss_dict['ce_loss'] = dict(
Expand Down
3 changes: 3 additions & 0 deletions model/seg/nets/pspnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ def forward(self, data_dict):
x = F.interpolate(x, size=(data_dict['img'].size(2), data_dict['img'].size(3)),
mode="bilinear", align_corners=True)
out_dict = dict(dsn_out=x_dsn, out=x)
if self.configer.get('phase') == 'test':
return out_dict

loss_dict = dict()
if 'dsn_ce_loss' in self.valid_loss_dict:
loss_dict['dsn_ce_loss'] = dict(
Expand Down
28 changes: 3 additions & 25 deletions model/tools/module_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,6 @@ def BNReLU(num_features, norm_type=None, **kwargs):
nn.BatchNorm2d(num_features, **kwargs),
nn.ReLU()
)
elif norm_type == 'sync_batchnorm':
from exts.ops.sync_bn.syncbn import BatchNorm2d
return nn.Sequential(
BatchNorm2d(num_features, **kwargs),
nn.ReLU()
)
elif norm_type == 'encsync_batchnorm':
from encoding.nn import BatchNorm2d
return nn.Sequential(
Expand All @@ -41,9 +35,6 @@ def BNReLU(num_features, norm_type=None, **kwargs):
nn.InstanceNorm2d(num_features, **kwargs),
nn.ReLU()
)
# elif bn_type == 'inplace_abn':
# from extensions.ops.inplace_abn.bn import InPlaceABNSync
# return InPlaceABNSync(num_features, **kwargs)
else:
Log.error('Not support BN type: {}.'.format(norm_type))
exit(1)
Expand All @@ -53,10 +44,6 @@ def BatchNorm3d(norm_type=None, ret_cls=False):
if norm_type == 'batchnorm':
return nn.BatchNorm3d

elif norm_type == 'sync_batchnorm':
from exts.ops.sync_bn.syncbn import BatchNorm3d
return BatchNorm3d

elif norm_type == 'encsync_batchnorm':
from encoding.nn import BatchNorm3d
return BatchNorm3d
Expand All @@ -79,10 +66,6 @@ def BatchNorm2d(norm_type=None, ret_cls=False):
if norm_type == 'batchnorm':
return nn.BatchNorm2d

elif norm_type == 'sync_batchnorm':
from exts.ops.sync_bn.syncbn import BatchNorm2d
return BatchNorm2d

elif norm_type == 'encsync_batchnorm':
from encoding.nn import BatchNorm2d
return BatchNorm2d
Expand All @@ -105,10 +88,6 @@ def BatchNorm1d(norm_type=None, ret_cls=False):
if norm_type == 'batchnorm':
return nn.BatchNorm1d

elif norm_type == 'sync_batchnorm':
from exts.ops.sync_bn.syncbn import BatchNorm1d
return BatchNorm1d

elif norm_type == 'encsync_batchnorm':
from encoding.nn import BatchNorm1d
return BatchNorm1d
Expand All @@ -127,7 +106,7 @@ def BatchNorm1d(norm_type=None, ret_cls=False):
exit(1)

@staticmethod
def load_model(model, pretrained=None, all_match=True):
def load_model(model, pretrained=None, all_match=True, map_location='cpu'):
if pretrained is None:
return model

Expand All @@ -137,7 +116,7 @@ def load_model(model, pretrained=None, all_match=True):

Log.info('Loading pretrained model:{}'.format(pretrained))
if all_match:
pretrained_dict = torch.load(pretrained)
pretrained_dict = torch.load(pretrained, map_location=map_location)
model_dict = model.state_dict()
load_dict = dict()
for k, v in pretrained_dict.items():
Expand All @@ -146,7 +125,6 @@ def load_model(model, pretrained=None, all_match=True):
else:
load_dict[k] = v

# load_dict = {k: v for k, v in pretrained_dict.items() if 'resinit.{}'.format(k) not in model_dict}
model.load_state_dict(load_dict)

else:
Expand All @@ -161,7 +139,7 @@ def load_model(model, pretrained=None, all_match=True):

@staticmethod
def load_url(url, map_location=None):
model_dir = os.path.join('~', '.PyTorchCV', 'model')
model_dir = os.path.join('~', '.TorchCV', 'model')
if not os.path.exists(model_dir):
os.makedirs(model_dir)

Expand Down
3 changes: 2 additions & 1 deletion runner/seg/fcn_segmentor.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def train(self):
self.data_time.update(time.time() - start_time)

# Forward pass.
data_dict = RunnerHelper.to_device(self, data_dict)
out = self.seg_net(data_dict)
# Compute the loss of the train batch & backward.
loss_dict = self.loss(out)
Expand Down Expand Up @@ -132,7 +133,7 @@ def val(self, data_loader=None):

data_loader = self.val_loader if data_loader is None else data_loader
for j, data_dict in enumerate(data_loader):

data_dict = RunnerHelper.to_device(self, data_dict)
with torch.no_grad():
# Forward pass.
out = self.seg_net(data_dict)
Expand Down
4 changes: 2 additions & 2 deletions runner/seg/fcn_segmentor_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ def test(self, test_dir, out_dir):
ImageHelper.save(image_canvas,
save_path=os.path.join(out_dir, 'vis/{}.png'.format(meta_list[i]['filename'])))

if self.configer.exists('data', 'label_list'):
if self.configer.get('data.label_list', default=None) is not None:
label_img = self.__relabel(label_img)

if self.configer.exists('data', 'reduce_zero_label') and self.configer.get('data', 'reduce_zero_label'):
if self.configer.get('data.reduce_zero_label', default=False):
label_img = label_img + 1
label_img = label_img.astype(np.uint8)

Expand Down
2 changes: 1 addition & 1 deletion runner/tools/blob_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def get_blob(self, data_dict, scale=None, flip=False):
if flip:
image = image.flip([2])

if self.configer.exists('test', 'fit_stride'):
if self.configer.get('test.fit_stride', default=0) > 0:
stride = self.configer.get('test', 'fit_stride')

pad_w = 0 if (border_hw[1] % stride == 0) else stride - (border_hw[1] % stride) # right
Expand Down
6 changes: 6 additions & 0 deletions runner/tools/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,18 @@ def train(runner):

if runner.configer.get('solver', 'lr')['metric'] == 'epoch':
while runner.runner_state['epoch'] < runner.configer.get('solver', 'max_epoch'):
if runner.configer.get('network.distributed'):
runner.train_loader.sampler.set_epoch(runner.runner_state['epoch'])

runner.train()
if runner.runner_state['epoch'] == runner.configer.get('solver', 'max_epoch'):
runner.val()
break
else:
while runner.runner_state['iters'] < runner.configer.get('solver', 'max_iters'):
if runner.configer.get('network.distributed'):
runner.train_loader.sampler.set_epoch(runner.runner_state['epoch'])

runner.train()
if runner.runner_state['iters'] == runner.configer.get('solver', 'max_iters'):
runner.val()
Expand Down
21 changes: 13 additions & 8 deletions runner/tools/runner_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,22 +29,27 @@ def to_device(runner, in_data):

@staticmethod
def _make_parallel(runner, net):
parallel_type = runner.configer.get('network.parallel', default='dp')
if parallel_type == 'dp':
if runner.configer.get('network.distributed', default=False):
from apex.parallel import DistributedDataParallel
torch.cuda.set_device(runner.configer.get('local_rank'))
torch.distributed.init_process_group(backend='nccl', init_method='env://')
net = DistributedDataParallel(net.cuda(), delay_allreduce=True)
return net
else:
net = net.to(torch.device('cpu' if runner.configer.get('gpu') is None else 'cuda'))
from exts.tools.parallel.data_parallel import ParallelModel
return ParallelModel(net, gather_=runner.configer.get('network', 'gather'))
elif parallel_type == 'ddp':
from exts.tools.parallel.data_parallel import DistributeParallelModel
return DistributeParallelModel(net, gather_=runner.configer.get('network', 'gather'))
else:
raise ValueError('Not support DataParallel: {}'.format(parallel_type))

@staticmethod
def load_net(runner, net, model_path=None):
if runner.configer.get('network.syncbn', default=False):
Log.info('Converting syncbn model...')
from apex.parallel import convert_syncbn_model
net = convert_syncbn_model(net)

if runner.configer.get('gpu') is not None:
net = RunnerHelper._make_parallel(runner, net)

net = net.to(torch.device('cpu' if runner.configer.get('gpu') is None else 'cuda'))
if model_path is not None or runner.configer.get('network', 'resume') is not None:
resume_path = runner.configer.get('network', 'resume')
resume_path = model_path if model_path is not None else resume_path
Expand Down
9 changes: 5 additions & 4 deletions scripts/seg/cityscapes/run_fs_deeplabv3_cityscapes_seg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,17 @@ if [[ ! -d ${LOG_DIR} ]]; then
mkdir -p ${LOG_DIR}
fi

NGPUS=4

if [[ "$1"x == "train"x ]]; then
${PYTHON} -u main.py --config_file ${CONFIG_FILE} --drop_last y --phase train --gather n \
--backbone ${BACKBONE} --model_name ${MODEL_NAME} --gpu 0 1 2 3 \
${PYTHON} -m torch.distributed.launch --nproc_per_node=${NGPUS} main.py --config_file ${CONFIG_FILE} --phase train --train_batch_size 1 --val_batch_size 1 \
--backbone ${BACKBONE} --model_name ${MODEL_NAME} --drop_last y --syncbn y \
--data_dir ${DATA_DIR} --loss_type ${LOSS_TYPE} --max_iters ${MAX_ITERS} \
--checkpoints_name ${CHECKPOINTS_NAME} --pretrained ${PRETRAINED_MODEL} 2>&1 | tee ${LOG_FILE}

elif [[ "$1"x == "resume"x ]]; then
${PYTHON} -u main.py --config_file ${CONFIG_FILE} --drop_last y --phase train --gather n \
--backbone ${BACKBONE} --model_name ${MODEL_NAME} --gpu 0 1 2 3 \
--backbone ${BACKBONE} --model_name ${MODEL_NAME} --gpu None \
--data_dir ${DATA_DIR} --loss_type ${LOSS_TYPE} --max_iters ${MAX_ITERS} \
--resume_continue y --resume ./checkpoints/seg/cityscapes/${CHECKPOINTS_NAME}_latest.pth \
--checkpoints_name ${CHECKPOINTS_NAME} --pretrained ${PRETRAINED_MODEL} 2>&1 | tee -a ${LOG_FILE}
Expand All @@ -46,7 +47,7 @@ elif [[ "$1"x == "val"x ]]; then
--backbone ${BACKBONE} --model_name ${MODEL_NAME} --checkpoints_name ${CHECKPOINTS_NAME} \
--resume ./checkpoints/seg/cityscapes/${CHECKPOINTS_NAME}_latest.pth \
--test_dir ${DATA_DIR}/val/image --out_dir val 2>&1 | tee -a ${LOG_FILE}
cd metrics/seg/
cd metric/seg/
${PYTHON} -u cityscapes_evaluator.py --pred_dir ../../results/seg/cityscapes/${CHECKPOINTS_NAME}/val/label \
--gt_dir ${DATA_DIR}/val/label 2>&1 | tee -a "../../"${LOG_FILE}

Expand Down
Loading

0 comments on commit 3967fb6

Please sign in to comment.