#!/bin/bash
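#
# Helper script for sharded ImageNet training with PyTorch Lightning.
# Usage: ./run <command> [args...]; "./run help" lists the available commands.
#
# nshards is the expected number of training shard files under ./shards;
# image is the tag of the docker container built by "./run build".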
nshards=1282
image=wdslightning
die() {
    echo "ERROR: $*"
    exit 1
}
require_tmux() {
    test -n "$TMUX" || die "this command opens tmux panes; run it inside a tmux session"
}
check_shards() {
    numshards=$(ls shards/imagenet-train-??????.tar 2>/dev/null | wc -l)
    if [[ $numshards != $nshards ]]; then die "wrong # of shards in ./shards (got: $numshards, wanted: $nshards)"; fi
}
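# check_shards assumes zero-padded six-digit shard numbering, e.g. (with
# nshards=1282) shards/imagenet-train-000000.tar through imagenet-train-001281.tar.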
cmd_clean() { # remove temporary files
    rm -f *-info.txt
    rm -f *.pth *.pth.tar
    rm -f *.log
}
cmd_venv() { # set up a virtualenv
    test -d venv || python3 -m venv venv
    source venv/bin/activate
    pip3 install -U pip
    pip3 install -U numpy scipy
    pip3 install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
    pip3 install -U -r requirements.txt
}
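# note: the pinned torch/torchvision wheels are cu111 builds, i.e. they assume
# a CUDA 11.1 capable driver; pick different wheel tags from
# download.pytorch.org if your setup differs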
cmd_makeshards() { # make shards from ImageNet data
    test -d ./data/train || die "./data/train: does not exist"
    test -d ./data/val || die "./data/val: does not exist"
    test -d ./shards || die "./shards: must exist and be a directory"
    source venv/bin/activate
    python3 makeshards.py "$@"
}
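# example (ImageNet is expected under ./data/train and ./data/val; extra
# flags, if any, are passed straight to makeshards.py):
#   mkdir -p shards && ./run makeshards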
cmd_train() { # run single-GPU training with PyTorch Lightning
    check_shards
    source venv/bin/activate
    python3 train.py --gpus 1 "$@"
}
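# example single-GPU run; the --model and --bucket flags shown here are the
# ones the tmux test commands below pass to train.py:
#   ./run train --model resnet18 --bucket ./shards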
cmd_spawn() { # run 2-GPU training with PyTorch Lightning using ddp_spawn
    check_shards
    source venv/bin/activate
    set -x
    python3 train.py --gpus 2 --accelerator ddp_spawn "$@"
}
# env PL_TORCH_DISTRIBUTED_BACKEND=nccl PL_IN_DDP_SUBPROCESS=1 CUDA_VISIBLE_DEVICES=0 MASTER_ADDR=127.0.0.1 MASTER_PORT=25700 WORLD_SIZE=2 NODE_RANK=0 LOCAL_RANK=0 RANK=0 python3 train.py --gpus=1 --num_nodes=2 --accelerator=ddp "$@"
cmd_twonode() { # run DDP training on two nodes; args: master_addr rank [remaining args are passed to train.py]
    # set_nccl
    master=$1; shift
    rank=$1; shift
    source venv/bin/activate
    python -m torch.distributed.launch --use_env --nproc_per_node=1 --nnodes=2 --node_rank=$rank --master_addr=$master --master_port=9966 train.py --gpus 1 --accelerator ddp "$@"
}
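# example two-node run (10.0.0.1 is a hypothetical master address; run one
# command on each machine):
#   ./run twonode 10.0.0.1 0 --model resnet18    # on the master node
#   ./run twonode 10.0.0.1 1 --model resnet18    # on the second node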
cmd_tmtest() { # run a simulated twonode test locally under tmux (fake: bucket)
    require_tmux
    tmux split-window 'CUDA_VISIBLE_DEVICES=0 ./run twonode localhost 0 --model resnet18 --bucket fake:; sleep 9999'
    tmux split-window 'CUDA_VISIBLE_DEVICES=1 ./run twonode localhost 1 --model resnet18 --bucket fake:; sleep 9999'
    tmux select-layout even-vertical
}
cmd_tmfull() { # run a simulated twonode test locally under tmux (real ./shards bucket)
    require_tmux
    tmux split-window 'CUDA_VISIBLE_DEVICES=0 ./run twonode localhost 0 --model resnet18 --bucket ./shards; sleep 9999'
    tmux split-window 'CUDA_VISIBLE_DEVICES=1 ./run twonode localhost 1 --model resnet18 --bucket ./shards; sleep 9999'
    tmux select-layout even-vertical
}
cmd_build() { # build docker container
    set -e
    docker build -t "$image" - "$@" < Dockerfile
}
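# example; any extra arguments are passed through to docker build:
#   ./run build --no-cache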
cmd_docker() { # run a command inside the docker container
    test -d ./shards/. || die "./shards must exist"
    # readlink -f resolves ./shards in case it is a symlink, so the real
    # directory gets bind-mounted into the container
    docker run -ti \
        --gpus "${gpu:-all}" \
        --runtime nvidia \
        -v /etc/passwd:/etc/passwd \
        -v /etc/group:/etc/group \
        -u "$(id -u):$(id -g)" \
        -v /tmp/.X11-unix:/tmp/.X11-unix \
        -v "$(pwd):/work" \
        -v "$(readlink -f shards):/work/shards" \
        -w /work \
        --ipc host \
        --net host \
        -e DISPLAY="$DISPLAY" \
        -e PS1='[[\w]]\$ ' \
        "$image" \
        "$@"
}
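# example: open a shell in the container; the gpu variable feeds docker's
# --gpus flag (default: all), e.g. restrict to the first GPU with:
#   gpu=device=0 ./run docker bash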
cmd_dtrain() { # run single-GPU training with PyTorch Lightning inside the docker container
    check_shards
    cmd_docker python3 train.py --gpus 1 "$@"
}
cmd_dtwonode() { # run DDP training on two nodes inside docker; args: master_addr rank [remaining args are passed to train.py]
    master=$1; shift
    rank=$1; shift
    set -x
    cmd_docker python -m torch.distributed.launch --use_env --nproc_per_node=1 --nnodes=2 --node_rank=$rank --master_addr=$master --master_port=9966 train.py --gpus 1 --accelerator ddp "$@"
}
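# example (same arguments as twonode, but runs inside the container;
# "--bucket fake:" is what the tmux tests use, apparently a fake data source):
#   ./run dtwonode localhost 0 --model resnet18 --bucket fake: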
cmd_tmdtest() { # run a simulated dtwonode test locally under tmux (fake: bucket)
    require_tmux
    tmux split-window 'gpu=device=0 ./run dtwonode localhost 0 --model resnet18 --bucket fake:; sleep 9999'
    tmux split-window 'gpu=device=1 ./run dtwonode localhost 1 --model resnet18 --bucket fake:; sleep 9999'
    tmux select-layout even-vertical
}
cmd="${1:-help}"
shift
set -e
case $cmd in
    help)
        echo; echo available commands:; echo
        grep '^cmd_[_0-9a-z]*() {' "$0" | sed 's/cmd_//;s/\(.*\)() *{* *#* */\1 -- /'
        ;;
    *.py)
        # cmd_versions
        set -e
        # hg status | grep -v '^M ' > /dev/null
        source venv/bin/activate
        export OMP_NUM_THREADS=1
        python3 "$cmd" "$@"
        ;;
    *)
        set -e
        declare -f "cmd_$cmd" > /dev/null || die "unknown command: $cmd (try: ./run help)"
        "cmd_$cmd" "$@"
        ;;
esac