pretrain.sh
#!/bin/bash
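# Multi-node pre-training launch script (via torchrun).
# A sketch of the environment this script assumes, supplied by the job launcher
# or cluster scheduler (the variable roles below follow how the script uses them):
#   WORLD_SIZE   - number of nodes (consumed as NNODES below)
#   RANK         - rank of this node
#   MASTER_ADDR  - address of the rank-0 node
#   MASTER_PORT  - free port on the rank-0 node
# Example invocation on a single 8-GPU node (illustrative values, not from the repo):
#   WORLD_SIZE=1 RANK=0 MASTER_ADDR=127.0.0.1 MASTER_PORT=29500 bash pretrain.sh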
export CUDA_DEVICE_MAX_CONNECTIONS=1
NNODES=$WORLD_SIZE                      # number of nodes, taken from the launcher-provided WORLD_SIZE; adjust as needed
GPUS_PER_NODE=8                         # GPUs per node; adjust to match the hardware
GPU_NUM=$(($GPUS_PER_NODE*$NNODES))     # total number of GPUs
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))  # redefine WORLD_SIZE as the total number of processes
echo "================================================"
echo "GPU_NUM: $GPU_NUM"
echo "================================================"
DISTRIBUTED_ARGS="\
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT \
"
echo $DISTRIBUTED_ARGS
mkdir -p OUTPUT/pretrain_convnext_base
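# Launch training; stdout is appended to the run log via tee.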
torchrun $DISTRIBUTED_ARGS \
train.py \
--config_file=uni_interleaved/configs/train/pretrain.yaml \
    --output_dir=OUTPUT/pretrain_convnext_base | tee -a OUTPUT/pretrain_convnext_base/pretrain_convnext_base_train.log