PyTorch CUDA allocator optimization for dynamic batch shape dataloading in ASR (#9061)

* Option to auto-set expandable_segments in PyTorch CUDA allocator

Signed-off-by: Piotr Żelasko <[email protected]>

* warning

Signed-off-by: Piotr Żelasko <[email protected]>

* set opts after parsing config

Signed-off-by: Piotr Żelasko <[email protected]>

---------

Signed-off-by: Piotr Żelasko <[email protected]>
pzelasko authored May 2, 2024
1 parent f15e897 commit 9100cfd
Showing 1 changed file with 26 additions and 1 deletion.
nemo/collections/common/data/lhotse/dataloader.py (26 additions, 1 deletion)
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import warnings
from dataclasses import dataclass
from functools import partial
@@ -74,6 +74,7 @@ class LhotseDataLoadingConfig:
    drop_last: bool = False
    shard_seed: int | str = "trng"
    max_open_streams: int | None = None
    cuda_expandable_segments: bool = True

    # 2.1 Multimodal sampling override options
    use_multimodal_sampling: bool = False
@@ -150,6 +151,8 @@ def get_lhotse_dataloader_from_config(

    config = make_structured_with_schema_warnings(config)

    maybe_set_cuda_expandable_segments(enabled=config.cuda_expandable_segments)

    # First, resolve the random seed in case a string value was provided.
    seed = resolve_seed(config.seed)
    fix_random_seed(seed)
@@ -451,6 +454,28 @@ def _flatten_alt_text(cut) -> list:
    return ans


def maybe_set_cuda_expandable_segments(enabled: bool):
    """
    Configures the PyTorch CUDA memory allocator to expand existing allocated segments
    instead of re-allocating them when tensor shapes grow.
    This can help speed up training when the sequence length and/or batch size change often,
    and makes the GPU more robust against OOM errors.
    See here for more details:
    https://pytorch.org/docs/stable/notes/cuda.html#optimizing-memory-usage-with-pytorch-cuda-alloc-conf
    """
    if enabled and torch.cuda.is_available():
        if (
            (value := os.environ.get("PYTORCH_CUDA_ALLOC_CONF")) is not None
            and len(value) > 0
            and "expandable_segments:True" not in value
        ):
            warnings.warn(
                "You have set PYTORCH_CUDA_ALLOC_CONF without the expandable_segments:True option. "
                "We're setting that option anyway. To disable it, set cuda_expandable_segments=False "
                "in the NeMo dataloader configuration."
            )
        torch.cuda.memory._set_allocator_settings("expandable_segments:True")


def _select_channel(cut, channel_selector: int | str) -> list:
    if isinstance(channel_selector, int):
        channel_idx = channel_selector

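For readers who want to try this outside of NeMo, here is a minimal sketch of the two ways to flip this allocator setting, plus how the new config knob is meant to be used. The `torch.cuda.memory._set_allocator_settings` call is the same private PyTorch API used in the diff above; the `PYTORCH_CUDA_ALLOC_CONF` environment variable is the documented public route; the `OmegaConf` snippet at the end is an illustrative assumption about a minimal dataloader config, not a verbatim NeMo example.

```python
import os

# Public, documented route: the env var is read when CUDA initializes,
# so it must be set before the first CUDA allocation (ideally before
# importing anything that touches CUDA).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from omegaconf import OmegaConf

# Runtime route used by this commit (private PyTorch API; may change
# between releases): flips the setting after CUDA is already importable.
if torch.cuda.is_available():
    torch.cuda.memory._set_allocator_settings("expandable_segments:True")

# Opting out via the new dataloader config field added in this commit.
# (Illustrative minimal config; a real dataloader config carries many
# more fields.)
config = OmegaConf.create({"cuda_expandable_segments": False})
```

Note that `cuda_expandable_segments=False` does not undo anything at the allocator level; per the diff, it simply makes `get_lhotse_dataloader_from_config` skip the `maybe_set_cuda_expandable_segments` call.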