From 5f16741da071ea79ab598a9661c3ad2c6bf62bb5 Mon Sep 17 00:00:00 2001
From: Evan Smothers
Date: Fri, 10 Jan 2025 10:41:54 -0800
Subject: [PATCH] Log grad norm aggregated over all ranks, not just rank zero

---
 recipes/dev/early_exit_finetune_distributed.py     | 2 +-
 recipes/full_finetune_distributed.py               | 2 +-
 recipes/lora_finetune_distributed.py               | 2 +-
 recipes/lora_finetune_distributed_multi_dataset.py | 2 +-
 recipes/qat_distributed.py                         | 2 +-
 recipes/qat_lora_finetune_distributed.py           | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/recipes/dev/early_exit_finetune_distributed.py b/recipes/dev/early_exit_finetune_distributed.py
index 5abc674356..663697e978 100644
--- a/recipes/dev/early_exit_finetune_distributed.py
+++ b/recipes/dev/early_exit_finetune_distributed.py
@@ -951,7 +951,7 @@ def train(self) -> None:
                         grad_norm = torch.nn.utils.clip_grad_norm_(
                             self._model.parameters(),
                             max_norm=float(self._clip_grad_norm),
-                        )
+                        ).full_tensor()
                     self._optimizer.step()
                     self._optimizer.zero_grad(set_to_none=True)
 
diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py
index 9ef5e6533f..4f32faefdb 100644
--- a/recipes/full_finetune_distributed.py
+++ b/recipes/full_finetune_distributed.py
@@ -786,7 +786,7 @@ def train(self) -> None:
                         grad_norm = torch.nn.utils.clip_grad_norm_(
                             self._model.parameters(),
                             max_norm=float(self._clip_grad_norm),
-                        )
+                        ).full_tensor()
                     self._optimizer.step()
                     self._optimizer.zero_grad(set_to_none=True)
 
diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py
index 2cdfcd8010..39c8b104e5 100644
--- a/recipes/lora_finetune_distributed.py
+++ b/recipes/lora_finetune_distributed.py
@@ -828,7 +828,7 @@ def train(self) -> None:
                         grad_norm = torch.nn.utils.clip_grad_norm_(
                             self._model.parameters(),
                             max_norm=float(self._clip_grad_norm),
-                        )
+                        ).full_tensor()
                     self._optimizer.step()
                     self._optimizer.zero_grad(set_to_none=True)
                     self._lr_scheduler.step()
diff --git a/recipes/lora_finetune_distributed_multi_dataset.py b/recipes/lora_finetune_distributed_multi_dataset.py
index ce482bfa27..a50147df8a 100644
--- a/recipes/lora_finetune_distributed_multi_dataset.py
+++ b/recipes/lora_finetune_distributed_multi_dataset.py
@@ -857,7 +857,7 @@ def train(self) -> None:
                         grad_norm = torch.nn.utils.clip_grad_norm_(
                             self._model.parameters(),
                             max_norm=float(self._clip_grad_norm),
-                        )
+                        ).full_tensor()
                     self._optimizer.step()
                     self._optimizer.zero_grad(set_to_none=True)
                     self._lr_scheduler.step()
diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py
index f1b1302b7d..8c458daa21 100644
--- a/recipes/qat_distributed.py
+++ b/recipes/qat_distributed.py
@@ -857,7 +857,7 @@ def train(self) -> None:
                         grad_norm = torch.nn.utils.clip_grad_norm_(
                             self._model.parameters(),
                             max_norm=float(self._clip_grad_norm),
-                        )
+                        ).full_tensor()
                     self._optimizer.step()
                     self._optimizer.zero_grad(set_to_none=True)
 
diff --git a/recipes/qat_lora_finetune_distributed.py b/recipes/qat_lora_finetune_distributed.py
index 133c39c94b..c742dae226 100644
--- a/recipes/qat_lora_finetune_distributed.py
+++ b/recipes/qat_lora_finetune_distributed.py
@@ -872,7 +872,7 @@ def train(self) -> None:
                         grad_norm = torch.nn.utils.clip_grad_norm_(
                             self._model.parameters(),
                             max_norm=float(self._clip_grad_norm),
-                        )
+                        ).full_tensor()
                     self._optimizer.step()
                     self._optimizer.zero_grad(set_to_none=True)
                     self._lr_scheduler.step()
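
Context for the change (not stated in the patch body, so treat the specifics as an assumption): when the model's parameters are sharded, torch.nn.utils.clip_grad_norm_ returns a DTensor, and .full_tensor() materializes the norm aggregated across all ranks, so the logged value is global rather than rank-local. Below is a minimal sketch of the pattern, assuming PyTorch >= 2.5 (public torch.distributed.tensor API) and a hypothetical helper name.

# Illustrative sketch, not part of the patch.
# Assumes PyTorch >= 2.5, where sharded parameters are DTensors and
# torch.distributed.tensor is public. Helper name is hypothetical.
import torch
from torch.distributed.tensor import DTensor


def clip_and_log_grad_norm(model: torch.nn.Module, max_norm: float) -> float:
    """Clip gradients and return the grad norm aggregated over all ranks."""
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)
    # With sharded (DTensor) parameters, clip_grad_norm_ returns a DTensor;
    # full_tensor() materializes the globally reduced norm on every rank,
    # so the logged value reflects all ranks rather than a rank-local shard.
    if isinstance(grad_norm, DTensor):
        grad_norm = grad_norm.full_tensor()
    return grad_norm.item()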