PyTorch CUDA allocator optimization for dynamic batch shape dataloading in ASR (#9061)

* Option to auto-set expandable_segments in PyTorch CUDA allocator

Signed-off-by: Piotr Żelasko <[email protected]>

* warning

Signed-off-by: Piotr Żelasko <[email protected]>

* set opts after parsing config

Signed-off-by: Piotr Żelasko <[email protected]>

---------

Signed-off-by: Piotr Żelasko <[email protected]>
pzelasko authored May 2, 2024
1 parent f15e897 commit 9100cfd
Showing 1 changed file with 26 additions and 1 deletion.
nemo/collections/common/data/lhotse/dataloader.py (26 additions, 1 deletion)
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import warnings
from dataclasses import dataclass
from functools import partial
@@ -74,6 +74,7 @@ class LhotseDataLoadingConfig:
    drop_last: bool = False
    shard_seed: int | str = "trng"
    max_open_streams: int | None = None
    cuda_expandable_segments: bool = True

    # 2.1 Multimodal sampling override options
    use_multimodal_sampling: bool = False
@@ -150,6 +151,8 @@ def get_lhotse_dataloader_from_config(

    config = make_structured_with_schema_warnings(config)

    maybe_set_cuda_expandable_segments(enabled=config.cuda_expandable_segments)

    # First, resolve the random seed in case a string value was provided.
    seed = resolve_seed(config.seed)
    fix_random_seed(seed)
@@ -451,6 +454,28 @@ def _flatten_alt_text(cut) -> list:
    return ans


def maybe_set_cuda_expandable_segments(enabled: bool):
    """
    Configures the PyTorch CUDA memory allocator to expand existing allocated segments
    instead of re-allocating them when tensor shapes grow.
    This can help speed up training when the sequence length and/or batch size change often,
    and makes the GPU more robust against OOM errors.
    See here for more details:
    https://pytorch.org/docs/stable/notes/cuda.html#optimizing-memory-usage-with-pytorch-cuda-alloc-conf
    """
    if enabled and torch.cuda.is_available():
        if (
            (value := os.environ.get("PYTORCH_CUDA_ALLOC_CONF")) is not None
            and len(value) > 0
            and "expandable_segments:True" not in value
        ):
            warnings.warn(
                "You have set PYTORCH_CUDA_ALLOC_CONF without the expandable_segments:True option. "
                "We're setting that option anyway. To disable it, set cuda_expandable_segments=False "
                "in the NeMo dataloader configuration."
            )
        torch.cuda.memory._set_allocator_settings("expandable_segments:True")


def _select_channel(cut, channel_selector: int | str) -> list:
    if isinstance(channel_selector, int):
        channel_idx = channel_selector

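For readers who want to try this outside of NeMo, here is a minimal sketch of the two ways to flip this allocator setting, plus how the new config knob is meant to be used. The `torch.cuda.memory._set_allocator_settings` call is the same private PyTorch API used in the diff above; the `PYTORCH_CUDA_ALLOC_CONF` environment variable is the documented public route; the `OmegaConf` snippet at the end is an illustrative assumption about a minimal dataloader config, not a verbatim NeMo example.

```python
import os

# Public, documented route: the env var is read when CUDA initializes,
# so it must be set before the first CUDA allocation (ideally before
# importing anything that touches CUDA).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from omegaconf import OmegaConf

# Runtime route used by this commit (private PyTorch API; may change
# between releases): flips the setting after CUDA is already importable.
if torch.cuda.is_available():
    torch.cuda.memory._set_allocator_settings("expandable_segments:True")

# Opting out via the new dataloader config field added in this commit.
# (Illustrative minimal config; a real dataloader config carries many
# more fields.)
config = OmegaConf.create({"cuda_expandable_segments": False})
```

Note that `cuda_expandable_segments=False` does not undo anything at the allocator level; per the diff, it simply makes `get_lhotse_dataloader_from_config` skip the `maybe_set_cuda_expandable_segments` call.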