Add rank to checkpoint save/load log lines for the Megatron and FSDP strategies

Signed-off-by: Ananth Subramaniam <[email protected]>
ananthsub · Jan 28, 2025 · c2f960f
1 parent 2127dd3
commit c2f960f
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 4 deletions.
diff --git a/nemo/lightning/pytorch/strategies/fsdp_strategy.py b/nemo/lightning/pytorch/strategies/fsdp_strategy.py
@@ -231,7 +231,7 @@ def save_checkpoint(
         self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options)
         end_time = time.monotonic()
         logging.info(
-            f'Global Checkpoint Save: Start time : {start_time} s : Time spent in save_checkpoint: {end_time - start_time} s'
+            f'Global Checkpoint Save: Rank : {torch.distributed.get_rank()} : Start time : {start_time} s : Time spent in save_checkpoint: {end_time - start_time} s'
         )
 
     @override
@@ -268,7 +268,7 @@ def load_checkpoint(self, checkpoint_path: str | Path) -> Dict[str, Any]:
         checkpoint = self.checkpoint_io.load_checkpoint(path, sharded_state_dict=sharded_state_dict)
         end_time = time.monotonic()
         logging.info(
-            f'Global Checkpoint Load: Start time : {start_time} s : Time spent in load_checkpoint: {end_time - start_time} s'
+            f'Global Checkpoint Load: Rank : {torch.distributed.get_rank()} : Start time : {start_time} s : Time spent in load_checkpoint: {end_time - start_time} s'
         )
         mcore_to_pyt_sharded_state_dict(checkpoint['sharded_state_dict'], msd)
 

diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -750,11 +750,14 @@ def save_checkpoint(
             if self.ckpt_save_optimizer:
                 checkpoint["optimizer"] = [self.optimizer_sharded_state_dict()]
 
+        from nemo.utils import AppState
+
+        app_state = AppState()
         start_time = time.monotonic()
         self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options)
         end_time = time.monotonic()
         logging.info(
-            f'Global Checkpoint Save: Start time : {start_time} s : Time spent in save_checkpoint: {end_time - start_time} s'
+            f'Global Checkpoint Save: Rank : {app_state.global_rank} : Start time : {start_time} s : Time spent in save_checkpoint: {end_time - start_time} s'
         )
 
     def should_restore_optimizer_states(self, selective_restore: bool = False) -> bool:
@@ -788,13 +791,17 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path], selective_restore:
             self.lightning_module.strict_loading if self.ckpt_load_strictness is None else self.ckpt_load_strictness
         )
 
+        from nemo.utils import AppState
+
+        app_state = AppState()
+
         start_time = time.monotonic()
         checkpoint = self.checkpoint_io.load_checkpoint(
             checkpoint_path, sharded_state_dict=sharded_state_dict, strict=strict
         )
         end_time = time.monotonic()
         logging.info(
-            f'Global Checkpoint Load: Start time : {start_time} s : Time spent in load_checkpoint: {end_time - start_time} s'
+            f'Global Checkpoint Load: Rank : {app_state.global_rank} : Start time : {start_time} s : Time spent in load_checkpoint: {end_time - start_time} s'
         )
 
         if selective_restore: