Skip to content

Commit

Permalink
Add NVTX ranges to categorize execution (#11945)
Browse files Browse the repository at this point in the history
* Add NVTX ranges to optimizer step

Signed-off-by: Jaemin Choi <[email protected]>

* Use Tim's module

Signed-off-by: Jaemin Choi <[email protected]>

* Fix NVTX functions import

Signed-off-by: Jaemin Choi <[email protected]>

* Apply isort and black reformatting

Signed-off-by: minitu <[email protected]>
Signed-off-by: Jaemin Choi <[email protected]>

* Apply isort and black reformatting

Signed-off-by: minitu <[email protected]>
Signed-off-by: Jaemin Choi <[email protected]>

* Use NsysCallback and AppState

Signed-off-by: Jaemin Choi <[email protected]>

* Apply isort and black reformatting

Signed-off-by: minitu <[email protected]>
Signed-off-by: Jaemin Choi <[email protected]>

* Add nvtx_label

Signed-off-by: Jaemin Choi <[email protected]>

* Add option to profile all ranks

Signed-off-by: Jaemin Choi <[email protected]>

* Update NVTX label for MCore optimizer

Signed-off-by: Jaemin Choi <[email protected]>

* Add NVTX range for data step

Signed-off-by: Jaemin Choi <[email protected]>

* Apply isort and black reformatting

Signed-off-by: minitu <[email protected]>
Signed-off-by: Jaemin Choi <[email protected]>

* Remove NVTX range for gpt_data_step

Signed-off-by: Jaemin Choi <[email protected]>

* Cleanup

Signed-off-by: Jaemin Choi <[email protected]>

* Use stack to keep track of NVTX ranges

Signed-off-by: Jaemin Choi <[email protected]>

* Apply isort and black reformatting

Signed-off-by: minitu <[email protected]>
Signed-off-by: Jaemin Choi <[email protected]>

* Apply isort and black reformatting

Signed-off-by: minitu <[email protected]>
Signed-off-by: Jaemin Choi <[email protected]>

* Capitalize NVTX label

Signed-off-by: Jaemin Choi <[email protected]>

* Fix linting failure

Signed-off-by: Jaemin Choi <[email protected]>

---------

Signed-off-by: Jaemin Choi <[email protected]>
Signed-off-by: minitu <[email protected]>
Co-authored-by: Jaemin Choi <[email protected]>
Co-authored-by: minitu <[email protected]>
  • Loading branch information
3 people authored Mar 4, 2025
1 parent 06684a6 commit a1a9745
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 2 deletions.
8 changes: 8 additions & 0 deletions nemo/core/optim/mcore_optim.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

import torch

from nemo.utils.nvtx import nvtx_range_pop, nvtx_range_push


def _filter_empty_common_step(state_dict):
"""
Expand Down Expand Up @@ -42,6 +44,8 @@ class McoreDistributedOptimizer(torch.optim.Optimizer):
optim (MegatronOptimizer): The distributed optimizer from Megatron Core.
"""

NVTX_LABEL = "nemo.core.optim.mcore_optim"

def __init__(self, optim):
self.defaults = {}
self.mcore_optimizer = optim
Expand Down Expand Up @@ -121,10 +125,14 @@ def step(self, closure=None):
# Apply closure
loss = None
if closure is not None:
nvtx_range_push(f"{McoreDistributedOptimizer.NVTX_LABEL}.step.closure")
loss = closure()
nvtx_range_pop(f"{McoreDistributedOptimizer.NVTX_LABEL}.step.closure")

# return unused update_successful, grad_norm, num_zeros_in_grad
nvtx_range_push(f"{McoreDistributedOptimizer.NVTX_LABEL}.step.step")
_, grad_norm, num_zeros_in_grad = self.mcore_optimizer.step()
nvtx_range_pop(f"{McoreDistributedOptimizer.NVTX_LABEL}.step.step")

return loss, grad_norm, num_zeros_in_grad

Expand Down
8 changes: 7 additions & 1 deletion nemo/lightning/pytorch/callbacks/nsys.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from lightning.pytorch.callbacks.callback import Callback

from nemo.utils import logging
from nemo.utils.app_state import AppState
from nemo.utils.get_rank import get_rank


Expand Down Expand Up @@ -48,9 +49,10 @@ class NsysCallback(Callback):
end_step (int): Global batch to end profiling
ranks (List[int]): Global rank IDs to profile
gen_shape (bool): Generate model and kernel details including input shapes
nvtx_ranges (bool): Insert NVTX ranges to categorize execution
Example:
>>> callback = NsysCallback(start_step=100, end_step=200, ranks=[0, 1], gen_shape=True)
>>> callback = NsysCallback(start_step=100, end_step=200, ranks=[0, 1], gen_shape=True, nvtx_ranges=False)
>>> trainer = Trainer(callbacks=[callback])
"""

Expand All @@ -60,6 +62,7 @@ def __init__(
end_step: int,
ranks: List[int] = [0],
gen_shape: bool = False,
nvtx_ranges: bool = False,
):
assert type(start_step) is int, f'Nsys start_step must be of type int. Found: {type(start_step)}'
self._nsys_profile_start_step = start_step
Expand All @@ -74,6 +77,9 @@ def __init__(
self._nsys_profile_ranks = ranks
self._nsys_profile_gen_shape = gen_shape

app_state = AppState()
app_state._nvtx_ranges = nvtx_ranges

logging.info(
f'Nsys profiling setup with start_step: {self._nsys_profile_start_step},'
f'and end_step: {self._nsys_profile_end_step}'
Expand Down
5 changes: 4 additions & 1 deletion nemo/utils/app_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from dataclasses import dataclass
from threading import Lock
from typing import Dict, Optional
from typing import Optional

from nemo.utils.metaclasses import Singleton

Expand Down Expand Up @@ -93,6 +93,9 @@ def __init__(self):
# command-line arguments for run
self._cmd_args = None

# Insert NVTX ranges to categorize execution
self._nvtx_ranges = False

@property
def device_id(self):
"""Property returns the device_id
Expand Down
63 changes: 63 additions & 0 deletions nemo/utils/nvtx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
from typing import Optional

import torch

from nemo.utils.app_state import AppState

# pylint: disable=C0116


@functools.cache
def _nvtx_enabled() -> bool:
    """Report whether NVTX range insertion is switched on.

    Reads the ``_nvtx_ranges`` flag from ``AppState()``. The result is memoized
    with an unbounded cache, so the value observed on the first call is the one
    returned by every later call; subsequent changes to the flag are not seen.
    """
    app_state = AppState()
    return app_state._nvtx_ranges


# Stack of messages for the NVTX ranges that are currently open: nvtx_range_push
# appends to it and nvtx_range_pop removes from the end, checking the message.
_nvtx_range_messages: list[str] = []


def nvtx_range_push(msg: str) -> None:
    """Open an NVTX range labelled ``msg``.

    Does nothing when NVTX range profiling is disabled. Otherwise the message is
    recorded on the module-level stack of open ranges and the range is pushed
    through ``torch.cuda.nvtx.range_push``.

    Args:
        msg: Label attached to the new range.
    """
    if _nvtx_enabled():
        # Record the label first so the matching pop can verify it later.
        _nvtx_range_messages.append(msg)
        torch.cuda.nvtx.range_push(msg)


def nvtx_range_pop(msg: Optional[str] = None) -> None:
    """Close the most recently opened NVTX range.

    Does nothing when NVTX range profiling is disabled.

    Args:
        msg: Optional label the innermost open range is expected to carry. When
            given, it is compared with the label recorded when the range was
            pushed.

    Raises:
        RuntimeError: If no range is recorded as open.
        ValueError: If ``msg`` is given and differs from the label of the
            innermost open range. The recorded stack is left untouched in that
            case, because the NVTX range itself has not been popped.
    """
    # Return immediately if NVTX range profiling is not enabled
    if not _nvtx_enabled():
        return

    if not _nvtx_range_messages:
        raise RuntimeError("Attempted to pop NVTX range from empty stack")

    # Validate against the top of the stack *before* removing it, so a mismatch
    # leaves the recorded labels in step with the ranges still open in NVTX.
    last_msg = _nvtx_range_messages[-1]
    if msg is not None and msg != last_msg:
        raise ValueError(
            f"Attempted to pop NVTX range from stack with msg={msg}, but last range has msg={last_msg}"
        )

    # Validation passed: drop the record and close the NVTX range together.
    _nvtx_range_messages.pop()
    torch.cuda.nvtx.range_pop()


# pylint: enable=C0116

0 comments on commit a1a9745

Please sign in to comment.