PyTorch CUDA allocator optimization for dynamic batch shape dataloading in ASR #9061

Merged
merged 5 commits on May 2, 2024

Changes from all commits
27 changes: 26 additions & 1 deletion nemo/collections/common/data/lhotse/dataloader.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import warnings
from dataclasses import dataclass
from functools import partial
@@ -74,6 +74,7 @@ class LhotseDataLoadingConfig:
drop_last: bool = False
shard_seed: int | str = "trng"
max_open_streams: int | None = None
cuda_expandable_segments: bool = True

# 2.1 Multimodal sampling override options
use_multimodal_sampling: bool = False
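
For reference, a minimal sketch of how the new option could be switched off from a user-side dataloader config. Only the cuda_expandable_segments key comes from this PR; the other keys, values, and overall layout are illustrative assumptions, not the exact NeMo schema:

from omegaconf import OmegaConf

# Any Lhotse dataloader config consumed by get_lhotse_dataloader_from_config
# can carry the flag; it defaults to True, so it only needs to be spelled out
# when opting out of the allocator tweak.
dataloader_cfg = OmegaConf.create(
    {
        "batch_duration": 600.0,   # illustrative sampling option
        "shuffle": True,           # illustrative sampling option
        "cuda_expandable_segments": False,  # disable the expandable-segments setting
    }
)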
@@ -150,6 +151,8 @@ def get_lhotse_dataloader_from_config(

config = make_structured_with_schema_warnings(config)

maybe_set_cuda_expandable_segments(enabled=config.cuda_expandable_segments)

# First, resolve the random seed in case a string value was provided.
seed = resolve_seed(config.seed)
fix_random_seed(seed)
@@ -451,6 +454,28 @@ def _flatten_alt_text(cut) -> list:
return ans


def maybe_set_cuda_expandable_segments(enabled: bool):
"""
Configures PyTorch memory allocator to expand existing allocated segments
instead of re-allocating them when tensor shape grows.
This can help speed up training when the sequence length and/or batch size change often,
and makes the GPU more robust against OOM errors.

See here for more details:
https://pytorch.org/docs/stable/notes/cuda.html#optimizing-memory-usage-with-pytorch-cuda-alloc-conf
"""
if enabled and torch.cuda.is_available():
if (
(value := os.environ.get("PYTORCH_CUDA_ALLOC_CONF")) is not None
and len(value) > 0
and "expandable_segments:True" not in value
):
warnings.warn(
"You have set PYTORCH_CUDA_ALLOC_CONF without expandable_segments:True option. We're setting that option anyway. To disable it, set cuda_expandable_segments=False in NeMo dataloader configuration."
)
torch.cuda.memory._set_allocator_settings("expandable_segments:True")


def _select_channel(cut, channel_selector: int | str) -> list:
if isinstance(channel_selector, int):
channel_idx = channel_selector
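
Usage note: the same allocator behavior can also be requested through the documented PYTORCH_CUDA_ALLOC_CONF environment variable instead of the private torch.cuda.memory._set_allocator_settings helper used in the diff above. A minimal sketch, assuming the variable is set before the first CUDA allocation (ideally before importing torch):

import os

# Must take effect before the CUDA caching allocator is initialized,
# so set it before the first CUDA tensor is created.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch

if torch.cuda.is_available():
    # The allocator now grows existing segments instead of allocating new ones
    # when tensor shapes change between steps.
    x = torch.zeros(1024, device="cuda")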