#!/bin/bash
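#
# Helper script for sharded ImageNet training with PyTorch Lightning.
# Usage: ./run <command> [args...]; "./run help" lists the available commands.
#
# nshards is the expected number of training shard files under ./shards;
# image is the tag of the docker container built by "./run build".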
nshards=1282
image=wdslightning
die() {
    echo "ERROR: $*"
    exit 1
}
require_tmux() {
    test -n "$TMUX" || die "this command opens tmux panes; run it inside a tmux session"
}
check_shards() {
    numshards=$(ls shards/imagenet-train-??????.tar 2>/dev/null | wc -l)
    if [[ $numshards != $nshards ]]; then die "wrong # of shards in ./shards (got: $numshards, wanted: $nshards)"; fi
}
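# check_shards assumes zero-padded six-digit shard numbering, e.g. (with
# nshards=1282) shards/imagenet-train-000000.tar through imagenet-train-001281.tar.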
cmd_clean() { # remove temporary files
    rm -f *-info.txt
    rm -f *.pth *.pth.tar
    rm -f *.log
}
cmd_venv() { # set up a virtualenv
    test -d venv || python3 -m venv venv
    source venv/bin/activate
    pip3 install -U pip
    pip3 install -U numpy scipy
    pip3 install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
    pip3 install -U -r requirements.txt
}
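# note: the pinned torch/torchvision wheels are cu111 builds, i.e. they assume
# a CUDA 11.1 capable driver; pick different wheel tags from
# download.pytorch.org if your setup differs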
cmd_makeshards() { # make shards from ImageNet data
    test -d ./data/train || die "./data/train: does not exist"
    test -d ./data/val || die "./data/val: does not exist"
    test -d ./shards || die "./shards: must exist and be a directory"
    source venv/bin/activate
    python3 makeshards.py "$@"
}
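# example (ImageNet is expected under ./data/train and ./data/val; extra
# flags, if any, are passed straight to makeshards.py):
#   mkdir -p shards && ./run makeshards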
cmd_train() { # run single-GPU training with PyTorch Lightning
    check_shards
    source venv/bin/activate
    python3 train.py --gpus 1 "$@"
}
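# example single-GPU run; the --model and --bucket flags shown here are the
# ones the tmux test commands below pass to train.py:
#   ./run train --model resnet18 --bucket ./shards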
cmd_spawn() { # run 2-GPU training with PyTorch Lightning using ddp_spawn
    check_shards
    source venv/bin/activate
    set -x
    python3 train.py --gpus 2 --accelerator ddp_spawn "$@"
}
# env PL_TORCH_DISTRIBUTED_BACKEND=nccl PL_IN_DDP_SUBPROCESS=1 CUDA_VISIBLE_DEVICES=0 MASTER_ADDR=127.0.0.1 MASTER_PORT=25700 WORLD_SIZE=2 NODE_RANK=0 LOCAL_RANK=0 RANK=0 python3 train.py --gpus=1 --num_nodes=2 --accelerator=ddp "$@"
cmd_twonode() { # run DDP training on two nodes; args: master_addr rank [remaining args are passed to train.py]
    # set_nccl
    master=$1; shift
    rank=$1; shift
    source venv/bin/activate
    python -m torch.distributed.launch --use_env --nproc_per_node=1 --nnodes=2 --node_rank=$rank --master_addr=$master --master_port=9966 train.py --gpus 1 --accelerator ddp "$@"
}
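# example two-node run (10.0.0.1 is a hypothetical master address; run one
# command on each machine):
#   ./run twonode 10.0.0.1 0 --model resnet18    # on the master node
#   ./run twonode 10.0.0.1 1 --model resnet18    # on the second node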
cmd_tmtest() { # run a simulated twonode test locally under tmux (fake: bucket)
    require_tmux
    tmux split-window 'CUDA_VISIBLE_DEVICES=0 ./run twonode localhost 0 --model resnet18 --bucket fake:; sleep 9999'
    tmux split-window 'CUDA_VISIBLE_DEVICES=1 ./run twonode localhost 1 --model resnet18 --bucket fake:; sleep 9999'
    tmux select-layout even-vertical
}
cmd_tmfull() { # run a simulated twonode test locally under tmux (real ./shards bucket)
    require_tmux
    tmux split-window 'CUDA_VISIBLE_DEVICES=0 ./run twonode localhost 0 --model resnet18 --bucket ./shards; sleep 9999'
    tmux split-window 'CUDA_VISIBLE_DEVICES=1 ./run twonode localhost 1 --model resnet18 --bucket ./shards; sleep 9999'
    tmux select-layout even-vertical
}
cmd_build() { # build docker container
    set -e
    docker build -t "$image" - "$@" < Dockerfile
}
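# example; any extra arguments are passed through to docker build:
#   ./run build --no-cache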
cmd_docker() { # run a command inside the docker container
    test -d ./shards/. || die "./shards must exist"
    # readlink -f resolves ./shards in case it is a symlink, so the real
    # directory gets bind-mounted into the container
    docker run -ti \
        --gpus "${gpu:-all}" \
        --runtime nvidia \
        -v /etc/passwd:/etc/passwd \
        -v /etc/group:/etc/group \
        -u "$(id -u):$(id -g)" \
        -v /tmp/.X11-unix:/tmp/.X11-unix \
        -v "$(pwd):/work" \
        -v "$(readlink -f shards):/work/shards" \
        -w /work \
        --ipc host \
        --net host \
        -e DISPLAY="$DISPLAY" \
        -e PS1='[[\w]]\$ ' \
        "$image" \
        "$@"
}
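# example: open a shell in the container; the gpu variable feeds docker's
# --gpus flag (default: all), e.g. restrict to the first GPU with:
#   gpu=device=0 ./run docker bash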
cmd_dtrain() { # run single-GPU training with PyTorch Lightning inside the docker container
    check_shards
    cmd_docker python3 train.py --gpus 1 "$@"
}
cmd_dtwonode() { # run DDP training on two nodes inside docker; args: master_addr rank [remaining args are passed to train.py]
    master=$1; shift
    rank=$1; shift
    set -x
    cmd_docker python -m torch.distributed.launch --use_env --nproc_per_node=1 --nnodes=2 --node_rank=$rank --master_addr=$master --master_port=9966 train.py --gpus 1 --accelerator ddp "$@"
}
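# example (same arguments as twonode, but runs inside the container;
# "--bucket fake:" is what the tmux tests use, apparently a fake data source):
#   ./run dtwonode localhost 0 --model resnet18 --bucket fake: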
cmd_tmdtest() { # run a simulated dtwonode test locally under tmux (fake: bucket)
    require_tmux
    tmux split-window 'gpu=device=0 ./run dtwonode localhost 0 --model resnet18 --bucket fake:; sleep 9999'
    tmux split-window 'gpu=device=1 ./run dtwonode localhost 1 --model resnet18 --bucket fake:; sleep 9999'
    tmux select-layout even-vertical
}
cmd="${1:-help}"
shift
set -e
case $cmd in
    help)
        echo; echo available commands:; echo
        grep '^cmd_[_0-9a-z]*() {' "$0" | sed 's/cmd_//;s/\(.*\)() *{* *#* */\1 -- /'
        ;;
    *.py)
        # cmd_versions
        set -e
        # hg status | grep -v '^M ' > /dev/null
        source venv/bin/activate
        export OMP_NUM_THREADS=1
        python3 "$cmd" "$@"
        ;;
    *)
        set -e
        declare -f "cmd_$cmd" > /dev/null || die "unknown command: $cmd (try: ./run help)"
        "cmd_$cmd" "$@"
        ;;
esac