Merge pull request #2478 from Zth9730/allocator_strategy

[ASR] Chang memory allocator strategy to fix gpu training hang
PaddlePaddle · Sep 29, 2022 · a657cc3 · a657cc3
2 parents 764fa0a + 404708c
commit a657cc3
Show file tree

Hide file tree

Showing 9 changed files with 36 additions and 0 deletions.
diff --git a/examples/aishell/asr0/local/train.sh b/examples/aishell/asr0/local/train.sh
@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
     export FLAGS_cudnn_deterministic=True
 fi
 
+# default memeory allocator strategy may case gpu training hang
+# for no OOM raised when memory exhaused
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \

diff --git a/examples/aishell/asr1/local/train.sh b/examples/aishell/asr1/local/train.sh
@@ -35,6 +35,10 @@ echo ${ips_config}
 
 mkdir -p exp
 
+# default memeory allocator strategy may case gpu training hang
+# for no OOM raised when memory exhaused
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \

diff --git a/examples/librispeech/asr0/local/train.sh b/examples/librispeech/asr0/local/train.sh
@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
     export FLAGS_cudnn_deterministic=True
 fi
 
+# default memeory allocator strategy may case gpu training hang
+# for no OOM raised when memory exhaused
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \

diff --git a/examples/librispeech/asr1/local/train.sh b/examples/librispeech/asr1/local/train.sh
@@ -29,6 +29,10 @@ fi
 # export FLAGS_cudnn_exhaustive_search=true
 # export FLAGS_conv_workspace_size_limit=4000
 
+# default memeory allocator strategy may case gpu training hang
+# for no OOM raised when memory exhaused
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \

diff --git a/examples/librispeech/asr2/local/train.sh b/examples/librispeech/asr2/local/train.sh
@@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
     export FLAGS_cudnn_deterministic=True
 fi
 
+# default memeory allocator strategy may case gpu training hang
+# for no OOM raised when memory exhaused
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \

diff --git a/examples/timit/asr1/local/train.sh b/examples/timit/asr1/local/train.sh
@@ -19,6 +19,10 @@ if [ ${seed} != 0  ]; then
     export FLAGS_cudnn_deterministic=True
 fi
 
+# default memeory allocator strategy may case gpu training hang
+# for no OOM raised when memory exhaused
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \

diff --git a/examples/tiny/asr0/local/train.sh b/examples/tiny/asr0/local/train.sh
@@ -32,6 +32,10 @@ fi
 
 mkdir -p exp
 
+# default memeory allocator strategy may case gpu training hang
+# for no OOM raised when memory exhaused
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \

diff --git a/examples/tiny/asr1/local/train.sh b/examples/tiny/asr1/local/train.sh
@@ -34,6 +34,10 @@ fi
 
 mkdir -p exp
 
+# default memeory allocator strategy may case gpu training hang
+# for no OOM raised when memory exhaused
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \

diff --git a/examples/wenetspeech/asr1/local/train.sh b/examples/wenetspeech/asr1/local/train.sh
@@ -35,6 +35,10 @@ echo ${ips_config}
 
 mkdir -p exp
 
+# default memeory allocator strategy may case gpu training hang
+# for no OOM raised when memory exhaused
+export FLAGS_allocator_strategy=naive_best_fit
+
 if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \