Add NVTX ranges to categorize execution (#11945)

* Add NVTX ranges to optimizer step
  Signed-off-by: Jaemin Choi <[email protected]>
* Use Tim's module
  Signed-off-by: Jaemin Choi <[email protected]>
* Fix NVTX functions import
  Signed-off-by: Jaemin Choi <[email protected]>
* Apply isort and black reformatting
  Signed-off-by: minitu <[email protected]>
  Signed-off-by: Jaemin Choi <[email protected]>
* Apply isort and black reformatting
  Signed-off-by: minitu <[email protected]>
  Signed-off-by: Jaemin Choi <[email protected]>
* Use NsysCallback and AppState
  Signed-off-by: Jaemin Choi <[email protected]>
* Apply isort and black reformatting
  Signed-off-by: minitu <[email protected]>
  Signed-off-by: Jaemin Choi <[email protected]>
* Add nvtx_label
  Signed-off-by: Jaemin Choi <[email protected]>
* Add option to profile all ranks
  Signed-off-by: Jaemin Choi <[email protected]>
* Update NVTX label for MCore optimizer
  Signed-off-by: Jaemin Choi <[email protected]>
* Add NVTX range for data step
  Signed-off-by: Jaemin Choi <[email protected]>
* Apply isort and black reformatting
  Signed-off-by: minitu <[email protected]>
  Signed-off-by: Jaemin Choi <[email protected]>
* Remove NVTX range for gpt_data_step
  Signed-off-by: Jaemin Choi <[email protected]>
* Cleanup
  Signed-off-by: Jaemin Choi <[email protected]>
* Use stack to keep track of NVTX ranges
  Signed-off-by: Jaemin Choi <[email protected]>
* Apply isort and black reformatting
  Signed-off-by: minitu <[email protected]>
  Signed-off-by: Jaemin Choi <[email protected]>
* Apply isort and black reformatting
  Signed-off-by: minitu <[email protected]>
  Signed-off-by: Jaemin Choi <[email protected]>
* Capitalize NVTX label
  Signed-off-by: Jaemin Choi <[email protected]>
* Fix linting failure
  Signed-off-by: Jaemin Choi <[email protected]>

---------
Signed-off-by: Jaemin Choi <[email protected]>
Signed-off-by: minitu <[email protected]>
Co-authored-by: Jaemin Choi <[email protected]>
Co-authored-by: minitu <[email protected]>
NVIDIA · Mar 4, 2025 · a1a9745 · a1a9745
1 parent 06684a6
commit a1a9745
Show file tree

Hide file tree

Showing 4 changed files with 82 additions and 2 deletions.
diff --git a/nemo/core/optim/mcore_optim.py b/nemo/core/optim/mcore_optim.py
@@ -14,6 +14,8 @@
 
 import torch
 
+from nemo.utils.nvtx import nvtx_range_pop, nvtx_range_push
+
 
 def _filter_empty_common_step(state_dict):
     """
@@ -42,6 +44,8 @@ class McoreDistributedOptimizer(torch.optim.Optimizer):
         optim (MegatronOptimizer): The distributed optimizer from Megatron Core.
     """
 
+    NVTX_LABEL = "nemo.core.optim.mcore_optim"
+
     def __init__(self, optim):
         self.defaults = {}
         self.mcore_optimizer = optim
@@ -121,10 +125,14 @@ def step(self, closure=None):
         # Apply closure
         loss = None
         if closure is not None:
+            nvtx_range_push(f"{McoreDistributedOptimizer.NVTX_LABEL}.step.closure")
             loss = closure()
+            nvtx_range_pop(f"{McoreDistributedOptimizer.NVTX_LABEL}.step.closure")
 
         # return unused update_successful, grad_norm, num_zeros_in_grad
+        nvtx_range_push(f"{McoreDistributedOptimizer.NVTX_LABEL}.step.step")
         _, grad_norm, num_zeros_in_grad = self.mcore_optimizer.step()
+        nvtx_range_pop(f"{McoreDistributedOptimizer.NVTX_LABEL}.step.step")
 
         return loss, grad_norm, num_zeros_in_grad
 

diff --git a/nemo/lightning/pytorch/callbacks/nsys.py b/nemo/lightning/pytorch/callbacks/nsys.py
@@ -18,6 +18,7 @@
 from lightning.pytorch.callbacks.callback import Callback
 
 from nemo.utils import logging
+from nemo.utils.app_state import AppState
 from nemo.utils.get_rank import get_rank
 
 
@@ -48,9 +49,10 @@ class NsysCallback(Callback):
         end_step (int): Global batch to end profiling
         ranks (List[int]): Global rank IDs to profile
         gen_shape (bool): Generate model and kernel details including input shapes
+        nvtx_ranges (bool): Insert NVTX ranges to categorize execution
 
     Example:
-        >>> callback = NsysCallback(start_step=100, end_step=200, ranks=[0, 1], gen_shape=True)
+        >>> callback = NsysCallback(start_step=100, end_step=200, ranks=[0, 1], gen_shape=True, nvtx_ranges=False)
         >>> trainer = Trainer(callbacks=[callback])
     """
 
@@ -60,6 +62,7 @@ def __init__(
         end_step: int,
         ranks: List[int] = [0],
         gen_shape: bool = False,
+        nvtx_ranges: bool = False,
     ):
         assert type(start_step) is int, f'Nsys start_step must be of type int. Found: {type(start_step)}'
         self._nsys_profile_start_step = start_step
@@ -74,6 +77,9 @@ def __init__(
         self._nsys_profile_ranks = ranks
         self._nsys_profile_gen_shape = gen_shape
 
+        app_state = AppState()
+        app_state._nvtx_ranges = nvtx_ranges
+
         logging.info(
             f'Nsys profiling setup with start_step: {self._nsys_profile_start_step},'
             f'and end_step: {self._nsys_profile_end_step}'

diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py
@@ -14,7 +14,7 @@
 
 from dataclasses import dataclass
 from threading import Lock
-from typing import Dict, Optional
+from typing import Optional
 
 from nemo.utils.metaclasses import Singleton
 
@@ -93,6 +93,9 @@ def __init__(self):
         # command-ling arguments for run
         self._cmd_args = None
 
+        # Insert NVTX ranges to categorize execution
+        self._nvtx_ranges = False
+
     @property
     def device_id(self):
         """Property returns the device_id

diff --git a/nemo/utils/nvtx.py b/nemo/utils/nvtx.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+from typing import Optional
+
+import torch
+
+from nemo.utils.app_state import AppState
+
+# pylint: disable=C0116
+
+
+@functools.lru_cache(maxsize=None)
+def _nvtx_enabled() -> bool:
+    """Return AppState()._nvtx_ranges, memoized by lru_cache: flag changes after the first call are not seen."""
+    return AppState()._nvtx_ranges
+
+
+# Messages associated with active NVTX ranges
+_nvtx_range_messages: list[str] = []
+
+
+def nvtx_range_push(msg: str) -> None:
+    # Open an NVTX range labelled `msg`; does nothing unless _nvtx_enabled() reports profiling is on
+    if not _nvtx_enabled():
+        return
+
+    # Record msg on the module-level list before pushing, so nvtx_range_pop can verify push/pop pairing
+    _nvtx_range_messages.append(msg)
+    torch.cuda.nvtx.range_push(msg)
+
+
+def nvtx_range_pop(msg: Optional[str] = None) -> None:
+    # Close the most recently pushed NVTX range; does nothing unless _nvtx_enabled() reports profiling is on
+    if not _nvtx_enabled():
+        return
+
+    # Take the last recorded message: RuntimeError if none is recorded, ValueError if a given msg differs
+    if not _nvtx_range_messages:
+        raise RuntimeError("Attempted to pop NVTX range from empty stack")
+    last_msg = _nvtx_range_messages.pop()
+    if msg is not None and msg != last_msg:
+        raise ValueError(
+            f"Attempted to pop NVTX range from stack with msg={msg}, " f"but last range has msg={last_msg}"
+        )
+
+    # Pop NVTX range. NOTE(review): skipped when a check above raises, although last_msg is already removed
+    torch.cuda.nvtx.range_pop()
+
+
+# pylint: enable=C0116