Skip to content

Commit

Permalink
Modify the benchmark regression script (open-mmlab#885)
Browse files Browse the repository at this point in the history
* add default test and train args

* remove partition field from default args to the parser

* fix typo in config

* fix typo in model list
  • Loading branch information
liqikai9 authored Aug 31, 2021
1 parent 5f3c176 commit 0839f84
Show file tree
Hide file tree
Showing 4 changed files with 151 additions and 125 deletions.
4 changes: 2 additions & 2 deletions .dev_scripts/benchmark/benchmark_cfg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ model_list:
checkpoint: https://download.openmmlab.com/mmpose/top_down/rsn/rsn18_coco_256x192-72f4b4a7_20201127.pth
# ViPNAS
## ViPNAS + COCO
- config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/s_vipnas_res50_coco_256x192.py
- config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vipnas_res50_coco_256x192.py
checkpoint: https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_coco_256x192-cc43b466_20210624.pth
# HRNetV2
## HRNetV2 + AFLW
Expand Down Expand Up @@ -70,7 +70,7 @@ model_list:
# CPM
## CPM + COCO
- config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/cpm_coco_256x192.py
checkpoint: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_coco_256x192-aa4ba095_20200817.
checkpoint: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_coco_256x192-aa4ba095_20200817.pth
## CPM + JHMDB
- config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/cpm_jhmdb_sub1_368x368.py
checkpoint: https://download.openmmlab.com/mmpose/top_down/cpm/cpm_jhmdb_sub1_368x368-2d2585c9_20201122.pth
Expand Down
266 changes: 146 additions & 120 deletions .dev_scripts/benchmark/benchmark_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,22 @@

import mmcv

# Fallback resource settings for test-mode tasks, used when a model entry
# in the benchmark config does not override them (single GPU per task).
DEFAULT_TEST_ARGS = {
    'gpus': 1,
    'gpus_per_node': 1,
    'cpus_per_task': 5,
}

# Fallback resource settings for train-mode tasks, used when a model entry
# in the benchmark config does not override them (8 GPUs per task).
DEFAULT_TRAIN_ARGS = {
    'gpus': 8,
    'gpus_per_node': 8,
    'cpus_per_task': 5,
}


def is_port_available(port, host='127.0.0.1'):
"""check whether a port is in use return True if the port is available else
False."""
"""check whether a port is in use, return True if the port is available
else False."""
s = None
try:
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
Expand All @@ -29,41 +41,48 @@ def is_port_available(port, host='127.0.0.1'):
def parse_args():
parser = argparse.ArgumentParser(
description='running benchmark regression with tmux')
parser.add_argument(
'--partition',
'-p',
help='models with priority higher or equal to this will be included')

parser.add_argument(
'--config',
'-c',
help='test config file path',
default='./.dev_scripts/benchmark/benchmark_regression_cfg.yaml')
default='./.dev_scripts/benchmark/benchmark_cfg.yaml')
parser.add_argument(
'--mode',
help='the benchmark regression mode, can be "test" or "train"',
default='test')

parser.add_argument(
'--priority',
nargs=2,
type=int,
help='largest priority for test and train tasks respectively',
default=[3, 3])
help='models with priority higher or equal to this will be included',
default=2)

# runtime setting parameters
parser.add_argument(
'--root-work-dir', help='the root working directory to store logs')
'--root-work-dir',
'-r',
help='the root working directory to store logs')
parser.add_argument(
'--session-name', '-s', help='the tmux session name', default='test')
'--session-name',
'-s',
help='the tmux session name',
default='benchmark_regression')
parser.add_argument(
'--panes-per-window',
'-w',
type=int,
help='the maximum number of panes in each tmux window',
default=12)
parser.add_argument(
'--env',
'-e',
help='the conda environment used to run the tasks',
default='pt1.6')
parser.add_argument(
'--partition', help='the partition name', default='mm_human')
parser.add_argument('--gpus', help='the total number of GPUs', default=8)
parser.add_argument(
'--gpus-per-node',
default=8,
help='the number of GPUs used per computing node',
choices=[1, 2, 3, 4, 5, 6, 7, 8])
parser.add_argument(
'--cpus-per-task', default=5, help='the number of CPUs used per task')

args = parser.parse_args()
return args
Expand All @@ -76,22 +95,15 @@ def main():
# get the current time stamp
now = datetime.now()
ts = now.strftime('%Y_%m_%d_%H_%M')
args.root_work_dir = f'work_dirs/benchmark_regression_{ts}'
args.root_work_dir = f'work_dirs/benchmark_regression_{args.mode}_{ts}'
mmcv.mkdir_or_exist(osp.abspath(args.root_work_dir))

cfg = mmcv.load(args.config)

# priority for test and train tasks respectively
prio_test, prio_train = args.priority
prio = max(prio_test, prio_train) + 1

# number of benchmark regression tasks
# number of tasks
num_task = 0
for i in range(prio):
if i <= prio_test:
num_task += len(cfg['model_list'][f'P{i}'])
if i <= prio_train:
num_task += len(cfg['model_list'][f'P{i}'])
for i in range(args.priority + 1):
num_task += len(cfg['model_list'][f'P{i}'])

# number of windows need to be created
num_win = math.ceil(num_task / args.panes_per_window)
Expand Down Expand Up @@ -123,30 +135,34 @@ def main():
os.system(f'tmux split-window -h -p {ratio}')
os.system('tmux select-layout tiled')

# the initial number of task
cur_task = 1

# get the hostname
hostname = socket.gethostname()
print('hostname: ', hostname)
print('Hostname: ', hostname)
# get the host ip
ip = socket.gethostbyname(hostname)
print('ip: ', ip)
print('IP: ', ip)

# the initial number of task
cur_task = 1
# initialize a starting port
cur_port = 29500

for i in range(prio):
for i in range(args.priority + 1):
models = cfg['model_list'][f'P{i}']

# modes = ['test','train']
modes = []
if i <= prio_test:
modes.append('test')
if i <= prio_train:
modes.append('train')

for model in models:
# select the window and pane
cur_win = int(math.ceil(cur_task / args.panes_per_window))
os.system('tmux select-window -t 0')
os.system(f'tmux select-window -t win_{cur_win}')
cur_pane = (cur_task - 1) % args.panes_per_window
os.system(f'tmux select-pane -t {cur_pane}')

cmd = f'conda activate {args.env}'
os.system(f'tmux send-keys "{cmd}" "C-m"')
cmd = f'echo executing task: {cur_task}'
os.system(f'tmux send-keys "{cmd}" "C-m"')

cur_config = model['config']
cur_checkpoint = model['checkpoint']

Expand All @@ -155,88 +171,98 @@ def main():
else:
task_name = osp.splitext(osp.basename(cur_config))[0]

for mode in modes:
# select the window and pane
cur_win = int(math.ceil(cur_task / args.panes_per_window))
os.system('tmux select-window -t 0')
os.system(f'tmux select-window -t win_{cur_win}')
cur_pane = (cur_task - 1) % args.panes_per_window
os.system(f'tmux select-pane -t {cur_pane}')
cur_task_name = args.mode + '_' + task_name
cur_work_dir = osp.join(args.root_work_dir, cur_task_name)

# if the port is used, use a random number for port
while not is_port_available(cur_port, ip):
cur_port = random.randint(1000, 50000)

if args.mode == 'test':
cur_gpus = model['test']['gpus'] if 'test' in model.keys(
) and 'gpus' in model['test'].keys(
) else DEFAULT_TEST_ARGS['gpus']
cur_gpus_per_node = model['test'][
'gpus_per_node'] if 'test' in model.keys(
) and 'gpus_per_node' in model['test'].keys(
) else DEFAULT_TEST_ARGS['gpus_per_node']
cur_cpus_per_task = model['test'][
'cpus_per_task'] if 'test' in model.keys(
) and 'cpus_per_task' in model['test'].keys(
) else DEFAULT_TEST_ARGS['cpus_per_task']
cur_partition = model['test'][
'partition'] if 'test' in model.keys(
) and 'partition' in model['test'].keys(
) else args.partition

cmd = f'conda activate {args.env}'
os.system(f'tmux send-keys "{cmd}" "C-m"')
cmd = f'echo executing task: {cur_task}'
# deal with extra python arguments
py_cmd = f' --work-dir {cur_work_dir} '

if 'test' in model.keys() and 'py_args' in model['test'].keys(
):
keys = list(model['test']['py_args'].keys())
values = list(model['test']['py_args'].values())

for k in range(len(keys)):
if values[k] is None:
if keys[k] in ['fuse_conv_bn', 'gpu_collect']:
py_cmd += f' --{keys[k]} '
else:
py_cmd += f' --{keys[k]} {values[k]} '
cmd = f'MASTER_PORT={cur_port} GPUS={cur_gpus} ' + \
f'GPUS_PER_NODE={cur_gpus_per_node} ' + \
f'CPUS_PER_TASK={cur_cpus_per_task} ' + \
f'./tools/slurm_test.sh {cur_partition} ' + \
f'{cur_task_name} ' + \
f'{cur_config} {cur_checkpoint} ' + \
f'{py_cmd}'
os.system(f'tmux send-keys "{cmd}" "C-m"')

cur_partition = model[mode][
'partition'] if 'partition' in model[mode].keys(
else:
cur_gpus = model['train']['gpus'] if 'train' in model.keys(
) and 'gpus' in model['train'].keys(
) else DEFAULT_TRAIN_ARGS['gpus']
cur_gpus_per_node = model['train'][
'gpus_per_node'] if 'train' in model.keys(
) and 'gpus_per_node' in model['train'].keys(
) else DEFAULT_TRAIN_ARGS['gpus_per_node']
cur_cpus_per_task = model['train'][
'cpus_per_task'] if 'train' in model.keys(
) and 'cpus_per_task' in model['train'].keys(
) else DEFAULT_TRAIN_ARGS['cpus_per_task']
cur_partition = model['train'][
'partition'] if 'train' in model.keys(
) and 'partition' in model['train'].keys(
) else args.partition
cur_gpus = model[mode]['gpus'] if 'gpus' in model[mode].keys(
) else args.gpus
cur_gpus_per_node = model[mode][
'gpus_per_node'] if 'gpus_per_node' in model[mode].keys(
) else args.gpus_per_node
cur_cpus_per_task = model[mode][
'cpus_per_task'] if 'cpus_per_task' in model[mode].keys(
) else args.cpus_per_task

cur_task_name = mode + '_' + task_name
cur_work_dir = osp.join(args.root_work_dir, cur_task_name)

if mode == 'test':
# deal with extra python arguments
py_cmd = f' --work-dir {cur_work_dir} '
if 'py_args' in model[mode].keys():
keys = list(model[mode]['py_args'].keys())
values = list(model[mode]['py_args'].values())

for k in range(len(keys)):
if values[k] is None:
if keys[k] in ['fuse_conv_bn', 'gpu_collect']:
py_cmd += f' --{keys[k]} '
else:
py_cmd += f' --{keys[k]} {values[k]} '
cmd = f'MASTER_PORT={cur_port} GPUS={cur_gpus} ' + \
f'GPUS_PER_NODE={cur_gpus_per_node} ' + \
f'CPUS_PER_TASK={cur_cpus_per_task} ' + \
f'./tools/slurm_test.sh {cur_partition} ' + \
f'{cur_task_name} ' + \
f'{cur_config} {cur_checkpoint} ' + \
f'{py_cmd}'

os.system(f'tmux send-keys "{cmd}" "C-m"')

else:
py_cmd = ' '
# deal with extra python arguments
if 'py_args' in model[mode].keys():
keys = list(model[mode]['py_args'].keys())
values = list(model[mode]['py_args'].values())

for k in range(len(keys)):
if values[k] is None:
if keys[k] in [
'no-validate', 'deterministic',
'autoscale-lr'
]:
py_cmd += f' --{keys[k]} '
else:
py_cmd += f' --{keys[k]} {values[k]} '
cmd = f'MASTER_PORT={cur_port} GPUS={cur_gpus} ' + \
f'GPUS_PER_NODE={cur_gpus_per_node} ' + \
f'CPUS_PER_TASK={cur_cpus_per_task} ' + \
f'./tools/slurm_train.sh {cur_partition} ' + \
f'{cur_task_name} ' + \
f'{cur_config} {cur_work_dir} ' + \
f'{py_cmd}'
os.system(f'tmux send-keys "{cmd}" "C-m"')

cur_port += 1
# if the port is used, use a random number for port
while not is_port_available(cur_port, ip):
cur_port = random.randint(29000, 39000)
print(f'port used in task {cur_task} is: {cur_port}')
cur_task += 1

# deal with extra python arguments
py_cmd = ' '
if 'train' in model.keys(
) and 'py_args' in model['train'].keys():
keys = list(model['train']['py_args'].keys())
values = list(model['train']['py_args'].values())

for k in range(len(keys)):
if values[k] is None:
if keys[k] in [
'no-validate', 'deterministic',
'autoscale-lr'
]:
py_cmd += f' --{keys[k]} '
else:
py_cmd += f' --{keys[k]} {values[k]} '
cmd = f'MASTER_PORT={cur_port} GPUS={cur_gpus} ' + \
f'GPUS_PER_NODE={cur_gpus_per_node} ' + \
f'CPUS_PER_TASK={cur_cpus_per_task} ' + \
f'./tools/slurm_train.sh {cur_partition} ' + \
f'{cur_task_name} ' + \
f'{cur_config} {cur_work_dir} ' + \
f'{py_cmd}'
os.system(f'tmux send-keys "{cmd}" "C-m"')

print(f'port used in task {cur_task} is: {cur_port}')
cur_task += 1
cur_port += 1

# close the base window
os.system('tmux select-window -t 0')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,4 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da

| Arch | Input Size | AP | AP<sup>50</sup> | AP<sup>75</sup> | AR | AR<sup>50</sup> | ckpt | log |
| :-------------- | :-----------: | :------: | :------: | :------: | :------: | :------: |:------: |:------: |
| [S-ViPNAS-Res50](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/s_vipnas_res50_coco_256x192.py) | 256x192 | 0.711 | 0.893 | 0.789 | 0.769 | 0.769 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_coco_256x192-cc43b466_20210624.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_coco_256x192_20210624.log.json) |
| [S-ViPNAS-Res50](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vipnas_res50_coco_256x192.py) | 256x192 | 0.711 | 0.893 | 0.789 | 0.769 | 0.769 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_coco_256x192-cc43b466_20210624.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_coco_256x192_20210624.log.json) |
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ Collections:
- https://link.springer.com/chapter/10.1007/978-3-319-10602-1_48
README: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vipnas_coco.md
Models:
- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/s_vipnas_res50_coco_256x192.py
- Config: configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vipnas_res50_coco_256x192.py
In Collection: vipnas_coco
Metadata:
Training Data: COCO
Name: body--2d_kpt_sview_rgb_img--topdown_heatmap--coco--s_vipnas_res50_coco_256x192
Name: body--2d_kpt_sview_rgb_img--topdown_heatmap--coco--vipnas_res50_coco_256x192
Results:
- Dataset: COCO
Metrics:
Expand Down

0 comments on commit 0839f84

Please sign in to comment.