From b073b2567bd2e369fe662ec628b19e91622d7021 Mon Sep 17 00:00:00 2001
From: fangyixiao18
Date: Fri, 25 Mar 2022 21:25:20 +0800
Subject: [PATCH 1/5] [Refactor] change args format to `--a-b`

---
 tools/slurm_train.sh |  2 +-
 tools/test.py        | 13 +++++++++----
 tools/train.py       | 21 +++++++++++++++++----
 3 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/tools/slurm_train.sh b/tools/slurm_train.sh
index d8ecfcdf2..db7dc5392 100644
--- a/tools/slurm_train.sh
+++ b/tools/slurm_train.sh
@@ -22,4 +22,4 @@ srun -p ${PARTITION} \
     --kill-on-bad-exit=1 \
     ${SRUN_ARGS} \
     python -u tools/train.py ${CONFIG} \
-    --work_dir=${WORK_DIR} --seed 0 --launcher="slurm" ${PY_ARGS}
+    --work-dir=${WORK_DIR} --seed 0 --launcher="slurm" ${PY_ARGS}
diff --git a/tools/test.py b/tools/test.py
index 18c207032..19c364a40 100644
--- a/tools/test.py
+++ b/tools/test.py
@@ -23,9 +23,9 @@ def parse_args():
     parser.add_argument('checkpoint', help='checkpoint file')
     parser.add_argument(
         '--work_dir',
-        type=str,
-        default=None,
-        help='the dir to save logs and models')
+        help='(Deprecated, please use --work-dir) the dir to save logs and '
+        'models')
+    parser.add_argument('--work-dir', help='the dir to save logs and models')
     parser.add_argument(
         '--launcher',
         choices=['none', 'pytorch', 'slurm', 'mpi'],
@@ -37,7 +37,12 @@ def parse_args():
         default=0,
         help='id of gpu to use '
         '(only applicable to non-distributed testing)')
-    parser.add_argument('--local_rank', type=int, default=0)
+    parser.add_argument(
+        '--local_rank',
+        type=int,
+        default=0,
+        help='(Deprecated, please use --local-rank)')
+    parser.add_argument('--local-rank', type=int, default=0)
     parser.add_argument(
         '--cfg-options',
         nargs='+',
diff --git a/tools/train.py b/tools/train.py
index 6bdecc0db..a38ef20f4 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -22,9 +22,17 @@ def parse_args():
     parser = argparse.ArgumentParser(description='Train a model')
     parser.add_argument('config', help='train config file path')
-    parser.add_argument('--work_dir', help='the dir to save logs and models')
     parser.add_argument(
-        '--resume_from', help='the checkpoint file to resume from')
+        '--work_dir',
+        help='(Deprecated, please use --work-dir) the dir to save logs and '
+        'models')
+    parser.add_argument('--work-dir', help='the dir to save logs and models')
+    parser.add_argument(
+        '--resume_from',
+        help='(Deprecated, please use --resume-from) the checkpoint file '
+        'to resume from')
+    parser.add_argument(
+        '--resume-from', help='the checkpoint file to resume from')
     parser.add_argument(
         '--auto-resume',
         action='store_true',
@@ -50,7 +58,7 @@ def parse_args():
         '(only applicable to non-distributed training)')
     parser.add_argument('--seed', type=int, default=None, help='random seed')
     parser.add_argument(
-        '--diff_seed',
+        '--diff-seed',
         action='store_true',
         help='Whether or not set different seeds for different ranks')
     parser.add_argument(
@@ -72,7 +80,12 @@ def parse_args():
         choices=['none', 'pytorch', 'slurm', 'mpi'],
         default='none',
         help='job launcher')
-    parser.add_argument('--local_rank', type=int, default=0)
+    parser.add_argument(
+        '--local_rank',
+        type=int,
+        default=0,
+        help='(Deprecated, please use --local-rank)')
+    parser.add_argument('--local-rank', type=int, default=0)
     args = parser.parse_args()
     if 'LOCAL_RANK' not in os.environ:
         os.environ['LOCAL_RANK'] = str(args.local_rank)

From bde6cf637afb5fbc6b52c01bed9831d3302c053f Mon Sep 17 00:00:00 2001
From: fangyixiao18
Date: Fri, 25 Mar 2022 22:35:10 +0800
Subject: [PATCH 2/5] modify tsne script

---
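Note for reviewers (text placed between the `---` and the diffstat is ignored
by `git am`): the deprecated spelling and the new `--a-b` spelling are kept as
separate `add_argument` calls throughout this series. That works because
argparse derives the same `dest` from both option strings (dashes are
converted to underscores), so downstream code that reads e.g. `args.work_dir`
is untouched. A minimal standalone sketch of the pattern, with illustrative
names that are not part of this patch:

    import argparse

    parser = argparse.ArgumentParser(description='deprecated-alias demo')
    parser.add_argument(
        '--work_dir',
        help='(Deprecated, please use --work-dir) the dir to save logs and '
        'models')
    parser.add_argument('--work-dir', help='the dir to save logs and models')

    # Both spellings store into the same attribute, args.work_dir.
    args = parser.parse_args(['--work-dir', 'work_dirs/demo'])
    assert args.work_dir == 'work_dirs/demo'
    args = parser.parse_args(['--work_dir', 'work_dirs/demo'])
    assert args.work_dir == 'work_dirs/demo'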
 tools/analysis_tools/visualize_tsne.py | 69 ++++++++++++++++++++++++--
 1 file changed, 66 insertions(+), 3 deletions(-)

diff --git a/tools/analysis_tools/visualize_tsne.py b/tools/analysis_tools/visualize_tsne.py
index 41ae9f286..d977847d8 100644
--- a/tools/analysis_tools/visualize_tsne.py
+++ b/tools/analysis_tools/visualize_tsne.py
@@ -24,7 +24,10 @@ def parse_args():
     parser.add_argument('config', help='train config file path')
     parser.add_argument('--checkpoint', default=None, help='checkpoint file')
     parser.add_argument(
-        '--work_dir', type=str, default=None, help='the dir to save results')
+        '--work_dir',
+        help='(Deprecated, please use --work-dir) the dir to save '
+        'results')
+    parser.add_argument('--work-dir', help='the dir to save results')
     parser.add_argument(
         '--launcher',
         choices=['none', 'pytorch', 'slurm', 'mpi'],
@@ -31,23 +34,47 @@ def parse_args():
         default='none',
         help='job launcher')
     parser.add_argument(
         '--dataset_config',
         default='configs/benchmarks/classification/tsne_imagenet.py',
+        help='(Deprecated, please use --dataset-config) '
+        'extract dataset config file path')
+    parser.add_argument(
+        '--dataset-config',
+        default='configs/benchmarks/classification/tsne_imagenet.py',
         help='extract dataset config file path')
     parser.add_argument(
         '--layer_ind',
         type=str,
         default='0,1,2,3,4',
+        help='(Deprecated, please use --layer-ind) layer indices, '
+        'separated by comma, e.g., "0,1,2,3,4"')
+    parser.add_argument(
+        '--layer-ind',
+        type=str,
+        default='0,1,2,3,4',
         help='layer indices, separated by comma, e.g., "0,1,2,3,4"')
     parser.add_argument(
         '--pool_type',
         choices=['specified', 'adaptive'],
         default='specified',
+        help='(Deprecated, please use --pool-type) Pooling type in '
+        ':class:`MultiPooling`')
+    parser.add_argument(
+        '--pool-type',
+        choices=['specified', 'adaptive'],
+        default='specified',
         help='Pooling type in :class:`MultiPooling`')
     parser.add_argument(
         '--max_num_class',
         type=int,
         default=20,
+        help='(Deprecated, please use --max-num-class) the maximum number '
+        'of classes to apply t-SNE algorithms, now the function supports '
+        'maximum 20 classes')
+    parser.add_argument(
+        '--max-num-class',
+        type=int,
+        default=20,
         help='the maximum number of classes to apply t-SNE algorithms, now the'
         'function supports maximum 20 classes')
     parser.add_argument('--seed', type=int, default=0, help='random seed')
@@ -68,7 +95,13 @@ def parse_args():
     # t-SNE settings
     parser.add_argument(
-        '--n_components', type=int, default=2, help='the dimension of results')
+        '--n_components',
+        type=int,
+        default=2,
+        help='(Deprecated, please use --n-components) the dimension of results'
+    )
+    parser.add_argument(
+        '--n-components', type=int, default=2, help='the dimension of results')
     parser.add_argument(
         '--perplexity',
         type=float,
         default=30.0,
@@ -77,30 +110,60 @@ def parse_args():
     parser.add_argument(
         '--early_exaggeration',
         type=float,
         default=12.0,
+        help='(Deprecated, please use --early-exaggeration) Controls how '
+        'tight natural clusters in the original space are in the embedded '
+        'space and how much space will be between them.')
+    parser.add_argument(
+        '--early-exaggeration',
+        type=float,
+        default=12.0,
         help='Controls how tight natural clusters in the original space are in'
         'the embedded space and how much space will be between them.')
     parser.add_argument(
         '--learning_rate',
         type=float,
         default=200.0,
+        help='(Deprecated, please use --learning-rate) The learning rate '
+        'for t-SNE is usually in the range [10.0, 1000.0]. '
+        'If the learning rate is too high, the data may look '
+        'like a ball with any point approximately equidistant from its nearest '
+        'neighbours. If the learning rate is too low, most points may look '
+        'compressed in a dense cloud with few outliers.')
+    parser.add_argument(
+        '--learning-rate',
+        type=float,
+        default=200.0,
         help='The learning rate for t-SNE is usually in the range'
         '[10.0, 1000.0]. If the learning rate is too high, the data may look'
         'like a ball with any point approximately equidistant from its nearest'
         'neighbours. If the learning rate is too low, most points may look'
         'compressed in a dense cloud with few outliers.')
     parser.add_argument(
         '--n_iter',
         type=int,
         default=1000,
+        help='(Deprecated, please use --n-iter) Maximum number of iterations '
+        'for the optimization. Should be at least 250.')
+    parser.add_argument(
+        '--n-iter',
+        type=int,
+        default=1000,
         help='Maximum number of iterations for the optimization. Should be at'
         'least 250.')
     parser.add_argument(
         '--n_iter_without_progress',
         type=int,
         default=300,
+        help='(Deprecated, please use --n-iter-without-progress) Maximum '
+        'number of iterations without progress before we abort the '
+        'optimization.')
+    parser.add_argument(
+        '--n-iter-without-progress',
+        type=int,
+        default=300,
         help='Maximum number of iterations without progress before we abort'
-        'the optimization')
+        ' the optimization.')
     parser.add_argument(
         '--init', type=str, default='random', help='The init method')
     args = parser.parse_args()

From 6696dd7efb7a00bc055c9fc458e2b8156b1e35c0 Mon Sep 17 00:00:00 2001
From: fangyixiao18
Date: Sat, 26 Mar 2022 21:08:47 +0800
Subject: [PATCH 3/5] modify 'sh' files

---
 tools/benchmarks/classification/dist_train_linear.sh  | 2 +-
 tools/benchmarks/classification/dist_train_semi.sh    | 2 +-
 tools/benchmarks/classification/slurm_train_linear.sh | 2 +-
 tools/benchmarks/classification/slurm_train_semi.sh   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/benchmarks/classification/dist_train_linear.sh b/tools/benchmarks/classification/dist_train_linear.sh
index 19fa3f343..0ef37f88d 100644
--- a/tools/benchmarks/classification/dist_train_linear.sh
+++ b/tools/benchmarks/classification/dist_train_linear.sh
@@ -24,7 +24,7 @@ python -m torch.distributed.launch \
     tools/train.py $CFG \
     --cfg-options model.backbone.init_cfg.type=Pretrained \
     model.backbone.init_cfg.checkpoint=$PRETRAIN \
-    --work_dir $WORK_DIR \
+    --work-dir $WORK_DIR \
     --seed 0 \
     --launcher="pytorch" \
     ${PY_ARGS}
diff --git a/tools/benchmarks/classification/dist_train_semi.sh b/tools/benchmarks/classification/dist_train_semi.sh
index 5d51ae845..7af3782e6 100644
--- a/tools/benchmarks/classification/dist_train_semi.sh
+++ b/tools/benchmarks/classification/dist_train_semi.sh
@@ -25,7 +25,7 @@ python -m torch.distributed.launch \
     tools/train.py $CFG \
     --cfg-options model.backbone.init_cfg.type=Pretrained \
     model.backbone.init_cfg.checkpoint=$PRETRAIN \
-    --work_dir $WORK_DIR \
+    --work-dir $WORK_DIR \
     --seed 0 \
     --launcher="pytorch" \
     ${PY_ARGS}
diff --git a/tools/benchmarks/classification/slurm_train_linear.sh b/tools/benchmarks/classification/slurm_train_linear.sh
index 1628df8f1..ccbc412a6 100644
--- a/tools/benchmarks/classification/slurm_train_linear.sh
+++ b/tools/benchmarks/classification/slurm_train_linear.sh
@@ -30,4 +30,4 @@ srun -p ${PARTITION} \
     --cfg-options model.backbone.init_cfg.type=Pretrained \
     model.backbone.init_cfg.checkpoint=$PRETRAIN \
     dist_params.port=$PORT \
-    --work_dir $WORK_DIR --seed 0 --launcher="slurm" ${PY_ARGS}
+    --work-dir $WORK_DIR --seed 0 --launcher="slurm" ${PY_ARGS}
diff --git a/tools/benchmarks/classification/slurm_train_semi.sh b/tools/benchmarks/classification/slurm_train_semi.sh
index ff0ba0a51..1a8737d38 100644
--- a/tools/benchmarks/classification/slurm_train_semi.sh
+++ b/tools/benchmarks/classification/slurm_train_semi.sh
@@ -30,4 +30,4 @@ srun -p ${PARTITION} \
     --cfg-options model.backbone.init_cfg.type=Pretrained \
     model.backbone.init_cfg.checkpoint=$PRETRAIN \
     dist_params.port=$PORT \
-    --work_dir $WORK_DIR --seed 0 --launcher="slurm" ${PY_ARGS}
+    --work-dir $WORK_DIR --seed 0 --launcher="slurm" ${PY_ARGS}

From d30c115d0ad710337dda86cbb79ee1adcb5c44ab Mon Sep 17 00:00:00 2001
From: fangyixiao18
Date: Sat, 26 Mar 2022 21:20:33 +0800
Subject: [PATCH 4/5] modify getting_started.md

---
 docs/en/getting_started.md    | 2 +-
 docs/zh_cn/getting_started.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/en/getting_started.md b/docs/en/getting_started.md
index 3bdadd174..db1b76926 100644
--- a/docs/en/getting_started.md
+++ b/docs/en/getting_started.md
@@ -157,7 +157,7 @@ python tools/model_converters/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENA
 We provide an off-the-shelf tool to visualize the quality of image representations by t-SNE.
 
 ```shell
-python tools/analysis_tools/visualize_tsne.py ${CONFIG_FILE} --checkpoint ${CKPT_PATH} --work_dir ${WORK_DIR} [optional arguments]
+python tools/analysis_tools/visualize_tsne.py ${CONFIG_FILE} --checkpoint ${CKPT_PATH} --work-dir ${WORK_DIR} [optional arguments]
 ```
 
 Arguments:
diff --git a/docs/zh_cn/getting_started.md b/docs/zh_cn/getting_started.md
index 1be283e01..af0b9fb09 100644
--- a/docs/zh_cn/getting_started.md
+++ b/docs/zh_cn/getting_started.md
@@ -155,7 +155,7 @@ python tools/model_converters/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENA
 我们提供了一个开箱即用的来做图片向量可视化的方法:
 
 ```shell
-python tools/analysis_tools/visualize_tsne.py ${CONFIG_FILE} --checkpoint ${CKPT_PATH} --work_dir ${WORK_DIR} [optional arguments]
+python tools/analysis_tools/visualize_tsne.py ${CONFIG_FILE} --checkpoint ${CKPT_PATH} --work-dir ${WORK_DIR} [optional arguments]
 ```
 
 参数:

From c7c0a09da2483c845c8c5660a34b26e7c39d8db2 Mon Sep 17 00:00:00 2001
From: fangyixiao18
Date: Sat, 26 Mar 2022 21:48:49 +0800
Subject: [PATCH 5/5] modify getting_started.md

---
 docs/en/getting_started.md    | 12 ++++++------
 docs/zh_cn/getting_started.md | 10 +++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/docs/en/getting_started.md b/docs/en/getting_started.md
index db1b76926..a8090aeec 100644
--- a/docs/en/getting_started.md
+++ b/docs/en/getting_started.md
@@ -31,19 +31,19 @@ python tools/train.py ${CONFIG_FILE}
 ### Train with single/multiple GPUs
 
 ```shell
-sh tools/dist_train.sh ${CONFIG_FILE} ${GPUS} --work_dir ${YOUR_WORK_DIR} [optional arguments]
+sh tools/dist_train.sh ${CONFIG_FILE} ${GPUS} --work-dir ${YOUR_WORK_DIR} [optional arguments]
 ```
 
 Optional arguments are:
 
-- `--resume_from ${CHECKPOINT_FILE}`: Resume from a previous checkpoint file.
+- `--resume-from ${CHECKPOINT_FILE}`: Resume from a previous checkpoint file.
 - `--deterministic`: Switch on "deterministic" mode which slows down training but the results are reproducible.
 
 An example:
 
 ```shell
 # checkpoints and logs saved in WORK_DIR=work_dirs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k/
-sh tools/dist_train.sh configs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k.py 8 --work_dir work_dirs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k/
+sh tools/dist_train.sh configs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k.py 8 --work-dir work_dirs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k/
 ```
 
 **Note**: During training, checkpoints and logs are saved in the same folder structure as the config file under `work_dirs/`. Custom work directory is not recommended since evaluation scripts infer work directories from the config file name. If you want to save your weights somewhere else, please use symlink, for example:
@@ -61,7 +61,7 @@ GPUS_PER_NODE=${GPUS_PER_NODE} GPUS=${GPUS} SRUN_ARGS=${SRUN_ARGS} sh tools/slur
 An example:
 
 ```shell
-GPUS_PER_NODE=8 GPUS=8 sh tools/srun_train.sh Dummy Test_job configs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k.py work_dirs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k/
+GPUS_PER_NODE=8 GPUS=8 sh tools/slurm_train.sh Dummy Test_job configs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k.py work_dirs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k/
 ```
 
 ### Train with multiple machines
@@ -91,8 +91,8 @@ If you launch multiple jobs on a single machine, e.g., 2 jobs of 4-GPU training
 If you use `dist_train.sh` to launch training jobs:
 
 ```shell
-CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 sh tools/dist_train.sh ${CONFIG_FILE} 4 --work_dir tmp_work_dir_1
-CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 sh tools/dist_train.sh ${CONFIG_FILE} 4 --work_dir tmp_work_dir_2
+CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 sh tools/dist_train.sh ${CONFIG_FILE} 4 --work-dir tmp_work_dir_1
+CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 sh tools/dist_train.sh ${CONFIG_FILE} 4 --work-dir tmp_work_dir_2
 ```
 
 If you use launch training jobs with slurm, you have two options to set different communication ports:
diff --git a/docs/zh_cn/getting_started.md b/docs/zh_cn/getting_started.md
index af0b9fb09..5e3357d2c 100644
--- a/docs/zh_cn/getting_started.md
+++ b/docs/zh_cn/getting_started.md
@@ -31,12 +31,12 @@ python tools/train.py ${CONFIG_FILE}
 ### 使用 单张/多张 显卡训练
 
 ```shell
-sh tools/dist_train.sh ${CONFIG_FILE} ${GPUS} --work_dir ${YOUR_WORK_DIR} [optional arguments]
+sh tools/dist_train.sh ${CONFIG_FILE} ${GPUS} --work-dir ${YOUR_WORK_DIR} [optional arguments]
 ```
 
 可选参数:
 
-- `--resume_from ${CHECKPOINT_FILE}`: 从某个 checkpoint 处继续训练.
+- `--resume-from ${CHECKPOINT_FILE}`: 从某个 checkpoint 处继续训练.
 - `--deterministic`: 开启 "deterministic" 模式, 虽然开启会使得训练速度降低,但是会保证结果可复现。
 
 例如:
@@ -57,7 +57,7 @@ GPUS_PER_NODE=${GPUS_PER_NODE} GPUS=${GPUS} SRUN_ARGS=${SRUN_ARGS} sh tools/slur
 例如:
 
 ```shell
-GPUS_PER_NODE=8 GPUS=8 sh tools/srun_train.sh Dummy Test_job configs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k.py work_dirs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k/
+GPUS_PER_NODE=8 GPUS=8 sh tools/slurm_train.sh Dummy Test_job configs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k.py work_dirs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k/
 ```
 
 ### 使用多台机器训练
@@ -87,8 +87,8 @@ NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_tr
 如果您使用 `dist_train.sh` 来启动训练任务:
 
 ```shell
-CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 sh tools/dist_train.sh ${CONFIG_FILE} 4 --work_dir tmp_work_dir_1
-CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 sh tools/dist_train.sh ${CONFIG_FILE} 4 --work_dir tmp_work_dir_2
+CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 sh tools/dist_train.sh ${CONFIG_FILE} 4 --work-dir tmp_work_dir_1
+CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 sh tools/dist_train.sh ${CONFIG_FILE} 4 --work-dir tmp_work_dir_2
 ```
 
 如果您使用 slurm 来启动训练任务,你有两种方式来为每个任务设置不同的端口:
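-- 
Series note (placed below the `--` mail-signature marker, so it is not part of
any patch): the t-SNE options renamed in patch 2 read like the constructor
parameters of scikit-learn's `sklearn.manifold.TSNE`, which the help strings
quote. Assuming that backend (parameter names as of scikit-learn 1.x), a
minimal sketch of how such parsed values are typically forwarded; the inputs
below are made up for illustration and are not taken from this repository:

    import numpy as np
    from sklearn.manifold import TSNE

    # Stand-ins for the values parse_args() would return and for the
    # features extracted from a backbone.
    features = np.random.rand(200, 512).astype(np.float32)

    tsne = TSNE(
        n_components=2,               # --n-components
        perplexity=30.0,              # --perplexity
        early_exaggeration=12.0,      # --early-exaggeration
        learning_rate=200.0,          # --learning-rate, usually [10.0, 1000.0]
        n_iter=1000,                  # --n-iter, at least 250
        n_iter_without_progress=300,  # --n-iter-without-progress
        init='random')                # --init
    result = tsne.fit_transform(features)  # embedded points, shape (200, 2)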