From 535acf60c6ea6f1e3e61ca91a3842a616d9f612b Mon Sep 17 00:00:00 2001
From: Will Constable
Date: Sun, 14 Jul 2024 21:17:02 -0700
Subject: [PATCH] [Cleanup] Remove libuv from run_llama_train.sh

libuv is now enabled by default. We can probably do without the
educational blurb there, and we don't need the env var either, since
the default has landed.

ghstack-source-id: 68c8d2abe7eb0777e2add8df7634367c31b7ec06
Pull Request resolved: https://github.com/pytorch/torchtitan/pull/453
---
 create_seed_checkpoint.sh | 1 -
 multinode_trainer.slurm   | 1 -
 run_llama_train.sh        | 3 ---
 3 files changed, 5 deletions(-)

diff --git a/create_seed_checkpoint.sh b/create_seed_checkpoint.sh
index 1abc77ec..3dfbde71 100755
--- a/create_seed_checkpoint.sh
+++ b/create_seed_checkpoint.sh
@@ -18,7 +18,6 @@
 
 set -ex
 
-export USE_LIBUV=1
 TRAINER_DIR=${1:-/home/$USER/local/torchtitan}
 NGPU=1
 LOG_RANK=0
diff --git a/multinode_trainer.slurm b/multinode_trainer.slurm
index 09b94ef1..4bc495d3 100644
--- a/multinode_trainer.slurm
+++ b/multinode_trainer.slurm
@@ -53,7 +53,6 @@ export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
 export NCCL_BUFFSIZE=2097152
 #export TORCH_DIST_INIT_BARRIER=1
 export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
-#export USE_LIBUV=1
 CONFIG_FILE=${CONFIG_FILE:-"./train_configs/llama2_13b.toml"}
 
 dcgmi profile --pause
diff --git a/run_llama_train.sh b/run_llama_train.sh
index cf4943a6..5a661284 100755
--- a/run_llama_train.sh
+++ b/run_llama_train.sh
@@ -7,9 +7,6 @@
 
 set -ex
 
-# libUV is a scalable backend for TCPStore which is used in processGroup
-# rendezvous. This is the recommended backend for distributed training.
-export USE_LIBUV=1
 TRAINER_DIR=${TRAINER_DIR:-/home/$USER/local/torchtitan}
 
 # use envs as local overrides for convenience
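
Editor's note: the patch removes the explicit opt-in because libuv became the
default TCPStore backend for process-group rendezvous in recent PyTorch
releases (around 2.4). If a particular setup ever needs the legacy,
non-libuv backend, the same environment variable should still work as an
opt-out; a minimal sketch, assuming a PyTorch build that still honors
USE_LIBUV:

    # Opt back out of libuv for a single run (assumes this PyTorch build
    # still reads the USE_LIBUV env var; 0 selects the legacy backend).
    USE_LIBUV=0 ./run_llama_train.sh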