Skip to content

Commit

Permalink
Merge pull request #2478 from Zth9730/allocator_strategy
Browse files Browse the repository at this point in the history
[ASR] Chang memory allocator strategy to fix gpu training hang
  • Loading branch information
zh794390558 authored Sep 29, 2022
2 parents 764fa0a + 404708c commit a657cc3
Show file tree
Hide file tree
Showing 9 changed files with 36 additions and 0 deletions.
4 changes: 4 additions & 0 deletions examples/aishell/asr0/local/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi

# The default memory allocator strategy may cause GPU training to hang,
# because no OOM error is raised when memory is exhausted.
export FLAGS_allocator_strategy=naive_best_fit

if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
Expand Down
4 changes: 4 additions & 0 deletions examples/aishell/asr1/local/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ echo ${ips_config}

mkdir -p exp

# The default memory allocator strategy may cause GPU training to hang,
# because no OOM error is raised when memory is exhausted.
export FLAGS_allocator_strategy=naive_best_fit

if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
Expand Down
4 changes: 4 additions & 0 deletions examples/librispeech/asr0/local/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi

# The default memory allocator strategy may cause GPU training to hang,
# because no OOM error is raised when memory is exhausted.
export FLAGS_allocator_strategy=naive_best_fit

if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
Expand Down
4 changes: 4 additions & 0 deletions examples/librispeech/asr1/local/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ fi
# export FLAGS_cudnn_exhaustive_search=true
# export FLAGS_conv_workspace_size_limit=4000

# The default memory allocator strategy may cause GPU training to hang,
# because no OOM error is raised when memory is exhausted.
export FLAGS_allocator_strategy=naive_best_fit

if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
Expand Down
4 changes: 4 additions & 0 deletions examples/librispeech/asr2/local/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi

# The default memory allocator strategy may cause GPU training to hang,
# because no OOM error is raised when memory is exhausted.
export FLAGS_allocator_strategy=naive_best_fit

if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
Expand Down
4 changes: 4 additions & 0 deletions examples/timit/asr1/local/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi

# The default memory allocator strategy may cause GPU training to hang,
# because no OOM error is raised when memory is exhausted.
export FLAGS_allocator_strategy=naive_best_fit

if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
Expand Down
4 changes: 4 additions & 0 deletions examples/tiny/asr0/local/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ fi

mkdir -p exp

# The default memory allocator strategy may cause GPU training to hang,
# because no OOM error is raised when memory is exhausted.
export FLAGS_allocator_strategy=naive_best_fit

if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
Expand Down
4 changes: 4 additions & 0 deletions examples/tiny/asr1/local/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ fi

mkdir -p exp

# The default memory allocator strategy may cause GPU training to hang,
# because no OOM error is raised when memory is exhausted.
export FLAGS_allocator_strategy=naive_best_fit

if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
Expand Down
4 changes: 4 additions & 0 deletions examples/wenetspeech/asr1/local/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ echo ${ips_config}

mkdir -p exp

# The default memory allocator strategy may cause GPU training to hang,
# because no OOM error is raised when memory is exhausted.
export FLAGS_allocator_strategy=naive_best_fit

if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
Expand Down

0 comments on commit a657cc3

Please sign in to comment.