forked from open-mmlab/mmdetection
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Enhance] Mask2Former Instance Segm Only (open-mmlab#7571)
* Mask2Former/MaskFormer instance only training/eval * obsolete config names * if cond is None fix * white space * fix tests * yapf formatting fix * semantic_seg None docstring * original config names * pan/ins unit test * show_result comment * pan/ins head unit test * redundant test * inherit configs * correct gpu # * revert version * BaseDetector.show_result comment * revert more versions * clarify comment * clarify comment * add FilterAnnotations to data pipeline * more complete Returns docstring * use pytest.mark.parametrize decorator * fix docstring formatting * lint * Include instances passing mask area test * Make FilterAnnotations generic for masks or bboxes * Duplicate assertion * Add pad config * Less hard coded padding setting * Clarify test arguments * Additional inst_seg configs * delete configs * Include original dev branch configs * Fix indent * fix lint error from merge conflict * Update .pre-commit-config.yaml * Rename mask2former_r50_lsj_8x2_50e_coco.py to mask2former_r50_lsj_8x2_50e_coco-panoptic.py * Update and rename mask2former_r101_lsj_8x2_50e_coco.py to mask2former_r101_lsj_8x2_50e_coco-panoptic.py * Update and rename mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py to mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic.py * Update and rename mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py to mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py * Update and rename mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py to mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic.py * Update and rename mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py to mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic.py * Update and rename mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py to mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py * Create mask2former_r50_lsj_8x2_50e_coco.py * Create mask2former_r101_lsj_8x2_50e_coco.py * Create mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py * Create mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py * Update test_forward.py * remove gt_sem_seg Co-authored-by: Cedric Luo <[email protected]>
- Loading branch information
Showing
19 changed files
with
680 additions
and
367 deletions.
There are no files selected for viewing
7 changes: 7 additions & 0 deletions
7
configs/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
_base_ = './mask2former_r50_lsj_8x2_50e_coco-panoptic.py' | ||
|
||
model = dict( | ||
backbone=dict( | ||
depth=101, | ||
init_cfg=dict(type='Pretrained', | ||
checkpoint='torchvision://resnet101'))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
253 changes: 253 additions & 0 deletions
253
configs/mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,253 @@ | ||
_base_ = [ | ||
'../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py' | ||
] | ||
num_things_classes = 80 | ||
num_stuff_classes = 53 | ||
num_classes = num_things_classes + num_stuff_classes | ||
model = dict( | ||
type='Mask2Former', | ||
backbone=dict( | ||
type='ResNet', | ||
depth=50, | ||
num_stages=4, | ||
out_indices=(0, 1, 2, 3), | ||
frozen_stages=-1, | ||
norm_cfg=dict(type='BN', requires_grad=False), | ||
norm_eval=True, | ||
style='pytorch', | ||
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), | ||
panoptic_head=dict( | ||
type='Mask2FormerHead', | ||
in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside | ||
strides=[4, 8, 16, 32], | ||
feat_channels=256, | ||
out_channels=256, | ||
num_things_classes=num_things_classes, | ||
num_stuff_classes=num_stuff_classes, | ||
num_queries=100, | ||
num_transformer_feat_level=3, | ||
pixel_decoder=dict( | ||
type='MSDeformAttnPixelDecoder', | ||
num_outs=3, | ||
norm_cfg=dict(type='GN', num_groups=32), | ||
act_cfg=dict(type='ReLU'), | ||
encoder=dict( | ||
type='DetrTransformerEncoder', | ||
num_layers=6, | ||
transformerlayers=dict( | ||
type='BaseTransformerLayer', | ||
attn_cfgs=dict( | ||
type='MultiScaleDeformableAttention', | ||
embed_dims=256, | ||
num_heads=8, | ||
num_levels=3, | ||
num_points=4, | ||
im2col_step=64, | ||
dropout=0.0, | ||
batch_first=False, | ||
norm_cfg=None, | ||
init_cfg=None), | ||
ffn_cfgs=dict( | ||
type='FFN', | ||
embed_dims=256, | ||
feedforward_channels=1024, | ||
num_fcs=2, | ||
ffn_drop=0.0, | ||
act_cfg=dict(type='ReLU', inplace=True)), | ||
operation_order=('self_attn', 'norm', 'ffn', 'norm')), | ||
init_cfg=None), | ||
positional_encoding=dict( | ||
type='SinePositionalEncoding', num_feats=128, normalize=True), | ||
init_cfg=None), | ||
enforce_decoder_input_project=False, | ||
positional_encoding=dict( | ||
type='SinePositionalEncoding', num_feats=128, normalize=True), | ||
transformer_decoder=dict( | ||
type='DetrTransformerDecoder', | ||
return_intermediate=True, | ||
num_layers=9, | ||
transformerlayers=dict( | ||
type='DetrTransformerDecoderLayer', | ||
attn_cfgs=dict( | ||
type='MultiheadAttention', | ||
embed_dims=256, | ||
num_heads=8, | ||
attn_drop=0.0, | ||
proj_drop=0.0, | ||
dropout_layer=None, | ||
batch_first=False), | ||
ffn_cfgs=dict( | ||
embed_dims=256, | ||
feedforward_channels=2048, | ||
num_fcs=2, | ||
act_cfg=dict(type='ReLU', inplace=True), | ||
ffn_drop=0.0, | ||
dropout_layer=None, | ||
add_identity=True), | ||
feedforward_channels=2048, | ||
operation_order=('cross_attn', 'norm', 'self_attn', 'norm', | ||
'ffn', 'norm')), | ||
init_cfg=None), | ||
loss_cls=dict( | ||
type='CrossEntropyLoss', | ||
use_sigmoid=False, | ||
loss_weight=2.0, | ||
reduction='mean', | ||
class_weight=[1.0] * num_classes + [0.1]), | ||
loss_mask=dict( | ||
type='CrossEntropyLoss', | ||
use_sigmoid=True, | ||
reduction='mean', | ||
loss_weight=5.0), | ||
loss_dice=dict( | ||
type='DiceLoss', | ||
use_sigmoid=True, | ||
activate=True, | ||
reduction='mean', | ||
naive_dice=True, | ||
eps=1.0, | ||
loss_weight=5.0)), | ||
panoptic_fusion_head=dict( | ||
type='MaskFormerFusionHead', | ||
num_things_classes=num_things_classes, | ||
num_stuff_classes=num_stuff_classes, | ||
loss_panoptic=None, | ||
init_cfg=None), | ||
train_cfg=dict( | ||
num_points=12544, | ||
oversample_ratio=3.0, | ||
importance_sample_ratio=0.75, | ||
assigner=dict( | ||
type='MaskHungarianAssigner', | ||
cls_cost=dict(type='ClassificationCost', weight=2.0), | ||
mask_cost=dict( | ||
type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), | ||
dice_cost=dict( | ||
type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), | ||
sampler=dict(type='MaskPseudoSampler')), | ||
test_cfg=dict( | ||
panoptic_on=True, | ||
# For now, the dataset does not support | ||
# evaluating semantic segmentation metric. | ||
semantic_on=False, | ||
instance_on=True, | ||
# max_per_image is for instance segmentation. | ||
max_per_image=100, | ||
iou_thr=0.8, | ||
# In Mask2Former's panoptic postprocessing, | ||
# it will filter mask area where score is less than 0.5 . | ||
filter_low_score=True), | ||
init_cfg=None) | ||
|
||
# dataset settings | ||
image_size = (1024, 1024) | ||
img_norm_cfg = dict( | ||
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) | ||
train_pipeline = [ | ||
dict(type='LoadImageFromFile', to_float32=True), | ||
dict( | ||
type='LoadPanopticAnnotations', | ||
with_bbox=True, | ||
with_mask=True, | ||
with_seg=True), | ||
dict(type='RandomFlip', flip_ratio=0.5), | ||
# large scale jittering | ||
dict( | ||
type='Resize', | ||
img_scale=image_size, | ||
ratio_range=(0.1, 2.0), | ||
multiscale_mode='range', | ||
keep_ratio=True), | ||
dict( | ||
type='RandomCrop', | ||
crop_size=image_size, | ||
crop_type='absolute', | ||
recompute_bbox=True, | ||
allow_negative_crop=True), | ||
dict(type='Normalize', **img_norm_cfg), | ||
dict(type='Pad', size=image_size), | ||
dict(type='DefaultFormatBundle', img_to_float=True), | ||
dict( | ||
type='Collect', | ||
keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), | ||
] | ||
test_pipeline = [ | ||
dict(type='LoadImageFromFile'), | ||
dict( | ||
type='MultiScaleFlipAug', | ||
img_scale=(1333, 800), | ||
flip=False, | ||
transforms=[ | ||
dict(type='Resize', keep_ratio=True), | ||
dict(type='RandomFlip'), | ||
dict(type='Normalize', **img_norm_cfg), | ||
dict(type='Pad', size_divisor=32), | ||
dict(type='ImageToTensor', keys=['img']), | ||
dict(type='Collect', keys=['img']), | ||
]) | ||
] | ||
data_root = 'data/coco/' | ||
data = dict( | ||
samples_per_gpu=2, | ||
workers_per_gpu=2, | ||
train=dict(pipeline=train_pipeline), | ||
val=dict( | ||
pipeline=test_pipeline, | ||
ins_ann_file=data_root + 'annotations/instances_val2017.json', | ||
), | ||
test=dict( | ||
pipeline=test_pipeline, | ||
ins_ann_file=data_root + 'annotations/instances_val2017.json', | ||
)) | ||
|
||
embed_multi = dict(lr_mult=1.0, decay_mult=0.0) | ||
# optimizer | ||
optimizer = dict( | ||
type='AdamW', | ||
lr=0.0001, | ||
weight_decay=0.05, | ||
eps=1e-8, | ||
betas=(0.9, 0.999), | ||
paramwise_cfg=dict( | ||
custom_keys={ | ||
'backbone': dict(lr_mult=0.1, decay_mult=1.0), | ||
'query_embed': embed_multi, | ||
'query_feat': embed_multi, | ||
'level_embed': embed_multi, | ||
}, | ||
norm_decay_mult=0.0)) | ||
optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) | ||
|
||
# learning policy | ||
lr_config = dict( | ||
policy='step', | ||
gamma=0.1, | ||
by_epoch=False, | ||
step=[327778, 355092], | ||
warmup='linear', | ||
warmup_by_epoch=False, | ||
warmup_ratio=1.0, # no warmup | ||
warmup_iters=10) | ||
|
||
max_iters = 368750 | ||
runner = dict(type='IterBasedRunner', max_iters=max_iters) | ||
|
||
log_config = dict( | ||
interval=50, | ||
hooks=[ | ||
dict(type='TextLoggerHook', by_epoch=False), | ||
dict(type='TensorboardLoggerHook', by_epoch=False) | ||
]) | ||
interval = 5000 | ||
workflow = [('train', interval)] | ||
checkpoint_config = dict( | ||
by_epoch=False, interval=interval, save_last=True, max_keep_ckpts=3) | ||
|
||
# Before 365001th iteration, we do evaluation every 5000 iterations. | ||
# After 365000th iteration, we do evaluation every 368750 iterations, | ||
# which means that we do evaluation at the end of training. | ||
dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] | ||
evaluation = dict( | ||
interval=interval, | ||
dynamic_intervals=dynamic_intervals, | ||
metric=['PQ', 'bbox', 'segm']) |
Oops, something went wrong.