Fix inference race condition (jpata#359)
* fix: race condition in run_predictions()

* chore: add ray[train] to requirements.txt

* chore: update flatiron batch scripts

* fix: python path in flatiron batch scripts

* fix: add mlcroissant to requirements.txt

* fix: sort requirements.txt
erwulff authored Oct 31, 2024
1 parent 39921a2 · commit 243fc61
Showing 13 changed files with 79 additions and 16 deletions.
mlpf/model/training.py (2 changes: 2 additions & 0 deletions)
@@ -1086,6 +1086,8 @@ def run(rank, world_size, config, args, outdir, logfile):
jet_match_dr=0.1,
dir_name=testdir_name,
)
if world_size > 1:
dist.barrier() # block until all workers finished executing run_predictions()

if (rank == 0) or (rank == "cpu"): # make plots only on a single machine
if args.make_plots:
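
The two added lines close a race condition in the test stage: rank 0 could previously reach the plotting step while other workers were still writing their prediction files. A minimal sketch of the resulting pattern, with run_predictions and make_plots as hypothetical stand-ins for the repository's actual helpers (only the barrier lines are taken from the diff):

import torch.distributed as dist

def test_and_plot(rank, world_size):
    # Every rank writes its own shard of predictions to disk.
    run_predictions(rank, world_size)  # hypothetical stand-in for the real call

    # New in this commit: do not let any rank move on until every worker
    # has finished writing its predictions.
    if world_size > 1:
        dist.barrier()  # block until all workers finished executing run_predictions()

    if (rank == 0) or (rank == "cpu"):  # make plots only on a single machine
        make_plots()  # hypothetical stand-in for the plotting step
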
requirements.txt (3 changes: 2 additions & 1 deletion)
@@ -12,6 +12,7 @@ jupyter-book
keras
keras-tuner
matplotlib
mlcroissant
mplhep
networkx
nevergrad
@@ -26,7 +27,7 @@ plotly
pre-commit
protobuf
pyarrow
ray[tune]
ray[train,tune]
scikit-learn
scikit-optimize
scipy
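
Of the two new requirements, ray[train] adds Ray Train, which the --ray-train path in mlpf/pipeline.py presumably relies on, and mlcroissant reads Croissant dataset metadata. Minimal usage sketches follow; the training loop, worker counts, and dataset URL are illustrative assumptions, not values from this repository:

from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

def train_loop_per_worker(config):
    # Per-worker training code; Ray Train sets up torch.distributed for each worker.
    pass

trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=4, use_gpu=True),  # illustrative values
)
result = trainer.fit()

import mlcroissant as mlc

dataset = mlc.Dataset(jsonld="https://example.org/mlpf/croissant.json")  # hypothetical URL
for record in dataset.records(record_set="default"):  # record-set name is an assumption
    print(record)
    break
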
scripts/flatiron/pt_raytrain_a100.slurm (31 changes: 28 additions & 3 deletions)
@@ -6,6 +6,7 @@
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus-per-node=4
#SBATCH --gpus-per-task=4
#SBATCH --cpus-per-task=64
#SBATCH --constraint=a100-80gb&sxm4
@@ -27,12 +28,36 @@ module --force purge; module load modules/2.2-20230808
module load slurm gcc cmake cuda/12.1.1 cudnn/8.9.2.26-12.x nccl openmpi apptainer

nvidia-smi
source ~/miniconda3/bin/activate pytorch
export PYTHONPATH=`pwd`
source ~/miniforge3/bin/activate mlpf
which python3
python3 --version

export CUDA_VISIBLE_DEVICES=0,1,2,3
num_gpus=${SLURM_GPUS_PER_TASK} # gpus per compute node
num_gpus=$((SLURM_GPUS_PER_NODE)) # gpus per compute node

export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} # necessary on JURECA for Ray to work

## Disable Ray Usage Stats
export RAY_USAGE_STATS_DISABLE=1

echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID"
echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
echo "DEBUG: SLURM_NNODES: $SLURM_NNODES"
echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS"
echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE"
echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST"
echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME"
echo "DEBUG: SLURM_NODEID: $SLURM_NODEID"
echo "DEBUG: SLURM_LOCALID: $SLURM_LOCALID"
echo "DEBUG: SLURM_PROCID: $SLURM_PROCID"
echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
echo "DEBUG: SLURM_JOB_NUM_NODES: $SLURM_JOB_NUM_NODES"
echo "DEBUG: SLURM_CPUS_PER_TASK: $SLURM_CPUS_PER_TASK"
echo "DEBUG: SLURM_GPUS_PER_TASK: $SLURM_GPUS_PER_TASK"
echo "DEBUG: SLURM_GPUS_PER_NODE: $SLURM_GPUS_PER_NODE"
echo "DEBUG: SLURM_GPUS: $SLURM_GPUS"
echo "DEBUG: num_gpus: $num_gpus"


if [ "$SLURM_JOB_NUM_NODES" -gt 1 ]; then
@@ -81,7 +106,7 @@ python3 -u mlpf/pipeline.py --train --ray-train \
--config $1 \
--prefix $2 \
--ray-cpus $((SLURM_CPUS_PER_TASK*SLURM_JOB_NUM_NODES)) \
--gpus $((SLURM_GPUS_PER_TASK*SLURM_JOB_NUM_NODES)) \
--gpus $((SLURM_GPUS_PER_NODE*SLURM_JOB_NUM_NODES)) \
--gpu-batch-multiplier 8 \
--num-workers 8 \
--prefetch-factor 16 \
scripts/flatiron/pt_raytrain_h100.slurm (32 changes: 29 additions & 3 deletions)
@@ -6,6 +6,7 @@
#SBATCH --exclusive
#SBATCH --tasks-per-node=1
#SBATCH -p gpu
#SBATCH --gpus-per-node=8
#SBATCH --gpus-per-task=8
#SBATCH --cpus-per-task=64
#SBATCH --constraint=ib-h100p
@@ -27,12 +28,37 @@ module --force purge; module load modules/2.2-20230808
module load slurm gcc cmake cuda/12.1.1 cudnn/8.9.2.26-12.x nccl openmpi apptainer

nvidia-smi
source ~/miniconda3/bin/activate pytorch
export PYTHONPATH=`pwd`
source ~/miniforge3/bin/activate mlpf
which python3
python3 --version

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
num_gpus=${SLURM_GPUS_PER_TASK} # gpus per compute node
num_gpus=$((SLURM_GPUS_PER_NODE)) # gpus per compute node

export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} # necessary on JURECA for Ray to work

## Disable Ray Usage Stats
export RAY_USAGE_STATS_DISABLE=1

echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID"
echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
echo "DEBUG: SLURM_NNODES: $SLURM_NNODES"
echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS"
echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE"
echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST"
echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME"
echo "DEBUG: SLURM_NODEID: $SLURM_NODEID"
echo "DEBUG: SLURM_LOCALID: $SLURM_LOCALID"
echo "DEBUG: SLURM_PROCID: $SLURM_PROCID"
echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
echo "DEBUG: SLURM_JOB_NUM_NODES: $SLURM_JOB_NUM_NODES"
echo "DEBUG: SLURM_CPUS_PER_TASK: $SLURM_CPUS_PER_TASK"
echo "DEBUG: SLURM_GPUS_PER_TASK: $SLURM_GPUS_PER_TASK"
echo "DEBUG: SLURM_GPUS_PER_NODE: $SLURM_GPUS_PER_NODE"
echo "DEBUG: SLURM_GPUS: $SLURM_GPUS"
echo "DEBUG: num_gpus: $num_gpus"


if [ "$SLURM_JOB_NUM_NODES" -gt 1 ]; then
################# DON NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ###############
@@ -80,7 +106,7 @@ python3 -u mlpf/pipeline.py --train --ray-train \
Expand Down Expand Up @@ -80,7 +106,7 @@ python3 -u mlpf/pipeline.py --train --ray-train \
--config $1 \
--prefix $2 \
--ray-cpus $((SLURM_CPUS_PER_TASK*SLURM_JOB_NUM_NODES)) \
--gpus $((SLURM_GPUS_PER_TASK*SLURM_JOB_NUM_NODES)) \
--gpus $((SLURM_GPUS_PER_NODE*SLURM_JOB_NUM_NODES)) \
--gpu-batch-multiplier 8 \
--num-workers 4 \
--prefetch-factor 8 \
scripts/flatiron/pt_raytune_a100_1GPUperTrial.sh (3 changes: 2 additions & 1 deletion)
@@ -26,7 +26,8 @@ module --force purge; module load modules/2.2-20230808
module load slurm gcc cmake cuda/12.1.1 cudnn/8.9.2.26-12.x nccl openmpi apptainer

nvidia-smi
source ~/miniconda3/bin/activate pytorch
export PYTHONPATH=`pwd`
source ~/miniforge3/bin/activate mlpf
which python3
python3 --version

scripts/flatiron/pt_raytune_a100_2GPUsperTrial.sh (3 changes: 2 additions & 1 deletion)
@@ -26,7 +26,8 @@ module --force purge; module load modules/2.2-20230808
module load slurm gcc cmake cuda/12.1.1 cudnn/8.9.2.26-12.x nccl openmpi apptainer

nvidia-smi
source ~/miniconda3/bin/activate pytorch
export PYTHONPATH=`pwd`
source ~/miniforge3/bin/activate mlpf
which python3
python3 --version

scripts/flatiron/pt_raytune_a100_4GPUsperTrial.sh (3 changes: 2 additions & 1 deletion)
@@ -26,7 +26,8 @@ module --force purge; module load modules/2.2-20230808
module load slurm gcc cmake cuda/12.1.1 cudnn/8.9.2.26-12.x nccl openmpi apptainer

nvidia-smi
source ~/miniconda3/bin/activate pytorch
export PYTHONPATH=`pwd`
source ~/miniforge3/bin/activate mlpf
which python3
python3 --version

scripts/flatiron/pt_raytune_h100_1GPUperTrial.sh (3 changes: 2 additions & 1 deletion)
@@ -26,7 +26,8 @@ module --force purge; module load modules/2.2-20230808
module load slurm gcc cmake cuda/12.1.1 cudnn/8.9.2.26-12.x nccl openmpi apptainer

nvidia-smi
source ~/miniconda3/bin/activate pytorch
export PYTHONPATH=`pwd`
source ~/miniforge3/bin/activate mlpf
which python3
python3 --version

scripts/flatiron/pt_raytune_h100_2GPUsperTrial.sh (3 changes: 2 additions & 1 deletion)
@@ -26,7 +26,8 @@ module --force purge; module load modules/2.2-20230808
module load slurm gcc cmake cuda/12.1.1 cudnn/8.9.2.26-12.x nccl openmpi apptainer

nvidia-smi
source ~/miniconda3/bin/activate pytorch
export PYTHONPATH=`pwd`
source ~/miniforge3/bin/activate mlpf
which python3
python3 --version

scripts/flatiron/pt_raytune_h100_4GPUsperTrial.sh (3 changes: 2 additions & 1 deletion)
@@ -26,7 +26,8 @@ module --force purge; module load modules/2.2-20230808
module load slurm gcc cmake cuda/12.1.1 cudnn/8.9.2.26-12.x nccl openmpi apptainer

nvidia-smi
source ~/miniconda3/bin/activate pytorch
export PYTHONPATH=`pwd`
source ~/miniforge3/bin/activate mlpf
which python3
python3 --version

scripts/flatiron/pt_raytune_h100_8GPUsperTrial.sh (3 changes: 2 additions & 1 deletion)
@@ -26,7 +26,8 @@ module --force purge; module load modules/2.2-20230808
module load slurm gcc cmake cuda/12.1.1 cudnn/8.9.2.26-12.x nccl openmpi apptainer

nvidia-smi
source ~/miniconda3/bin/activate pytorch
export PYTHONPATH=`pwd`
source ~/miniforge3/bin/activate mlpf
which python3
python3 --version

scripts/flatiron/pt_test.slurm (3 changes: 2 additions & 1 deletion)
@@ -27,7 +27,8 @@ module --force purge; module load modules/2.2-20230808
module load slurm gcc cmake cuda/12.1.1 cudnn/8.9.2.26-12.x nccl openmpi apptainer

nvidia-smi
source ~/miniconda3/bin/activate pytorch
export PYTHONPATH=`pwd`
source ~/miniforge3/bin/activate mlpf
which python3
python3 --version

scripts/flatiron/pt_train.slurm (3 changes: 2 additions & 1 deletion)
@@ -27,7 +27,8 @@ module --force purge; module load modules/2.2-20230808
module load slurm gcc cmake cuda/12.1.1 cudnn/8.9.2.26-12.x nccl openmpi apptainer

nvidia-smi
source ~/miniconda3/bin/activate pytorch
export PYTHONPATH=`pwd`
source ~/miniforge3/bin/activate mlpf
which python3
python3 --version
