#!/bin/bash
#SBATCH --job-name=stu # Name of the job
#SBATCH --nodes=1 # Number of nodes
#SBATCH --ntasks-per-node=1 # Each node runs 1 task that manages all GPUs
#SBATCH --gpus-per-task=8 # Number of GPUs to allocate per task
#SBATCH --cpus-per-task=8 # Should be at least the number of GPUs per task
#SBATCH --mem=48G # Total memory for job
#SBATCH --time=15:59:00 # Max time limit
#SBATCH --error=stu_%j.err
#SBATCH --output=stu_%j.out
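
# To submit and monitor this job (standard SLURM commands; adjust the path if the
# script lives elsewhere):
#   sbatch job.slurm
#   squeue -u $USER        # check queue status
#   scancel <jobid>        # cancel the job if needed
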
# Logging
log_info() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}
# Error handling
set -e
trap 'log_info "Error on line $LINENO"; exit 1' ERR
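
# Optional stricter settings (a sketch; uncomment if they suit your workflow):
# set -u          # error on references to unset variables
# set -o pipefail # fail a pipeline if any command in it fails
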
# Activate your virtual environment accordingly
source .venv/bin/activate
# Get the first node (master node) from the SLURM_JOB_NODELIST
MASTER_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1)
# Get the IP address of the master node
MASTER_NODE_ADDR=$(srun --nodes=1 --ntasks=1 -w "$MASTER_NODE" hostname --ip-address)
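# Note: on some systems `hostname --ip-address` prints more than one address; if so,
# keep only the first (e.g. pipe through `awk '{print $1}'`). For this single-node
# job, 127.0.0.1 would also work; the lookup above generalizes to multi-node runs.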
# Find an available port
RDZV_PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
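# Note: the port is freed before torchrun binds it, so a rare collision is possible;
# a fixed port (e.g. RDZV_PORT=29500) is a simple fallback if that ever happens.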
# Log start of training run
log_info "Starting training run..."
# Run the script using torchrun
torchrun \
    --nnodes 1 \
    --nproc_per_node 8 \
    --rdzv_id $SLURM_JOB_ID \
    --rdzv_backend c10d \
    --rdzv_endpoint $MASTER_NODE_ADDR:$RDZV_PORT \
    --max-restarts 16 \
    example.py
# Log end of training run
log_info "Job finished."
##################### HELPFUL SLURM COMMANDS #####################
# Disk usage and quota info; request additional space if needed.
# `checkquota`
# Operating system details.
# `cat /etc/os-release`
# CPU specifications on the current node.
# `lscpu`
# Compute node information (standard and easy-to-read formats).
# `snodes`
# `shownodes`
# Cluster nodes usage overview; check status (idle, down, busy).
# `sinfo`
# GPU usage specifics.
# `sinfo -p gpu`
# Quality of Service insights: job partitions and limits.
# `qos`
# Current processor activity; exit with 'q'.
# `top`
# `htop`
# Overview of group-assigned cluster shares.
# `sshare`
# Job priority mechanics: factors and weights.
# `sprio -w`
# Performance of a completed job by job ID.
# `seff <jobid>`
# Your historical job records.
# `shistory`
# Detailed job statistics (memory, CPU, GPU).
# `jobstats <jobid>`
# Additional commands for GPU details:
# Details about GPUs on the cluster.
# `snodes`
# Number of available GPUs.
# `shownodes -p gpu,mig`
# GPU utilization, refreshed every 10 min.
# `gpudash`
# Specific to your jobs.
# `gpudash -u $USER`
# Real-time GPU status on active jobs.
# `nvidia-smi` OR `watch nvidia-smi`
# Your queue status.
# `squeue -u $USER`
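# A couple of standard SLURM commands that may also be useful here:
# Cancel a job by ID.
# `scancel <jobid>`
# Accounting details for past jobs (a standard counterpart to jobstats).
# `sacct -j <jobid> --format=JobID,Elapsed,MaxRSS,State`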