diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py
index 8c1d2003cb1b..5f35c1884a41 100755
--- a/deepspeed/runtime/fp16/fused_optimizer.py
+++ b/deepspeed/runtime/fp16/fused_optimizer.py
@@ -153,10 +153,10 @@ def step_fused_adam(self, closure=None):
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.cur_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                       self.cur_scale))
             return self.overflow
 
         combined_scale = self.unscale_and_clip_grads(grads_groups_flat,
                                                      norm_groups,
diff --git a/deepspeed/runtime/fp16/loss_scaler.py b/deepspeed/runtime/fp16/loss_scaler.py
index ad6f8f6227f9..954d0ea61585 100755
--- a/deepspeed/runtime/fp16/loss_scaler.py
+++ b/deepspeed/runtime/fp16/loss_scaler.py
@@ -213,7 +213,7 @@ def update_scale(self, overflow):
             optimizer.step()
         # Otherwise, don't do anything -- ie, skip iteration
         else:
-            print('OVERFLOW!')
+            print('fp16 dynamic loss scale overflow!')
 
         # Update loss scale for next iteration
         loss_scaler.update_scale(has_overflow)
diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py
index 37edf9d5002d..c0cef6a56ba7 100755
--- a/deepspeed/runtime/fp16/unfused_optimizer.py
+++ b/deepspeed/runtime/fp16/unfused_optimizer.py
@@ -139,10 +139,10 @@ def step_fused_lamb(self, closure=None):
         self._update_scale(self.overflow)
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.cur_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                       self.cur_scale))
             return self.overflow
 
         combined_scale = self.unscale_and_clip_grads(norm_groups, apply_scale=False)
@@ -165,10 +165,10 @@ def step(self, closure=None):
         self._update_scale(self.overflow)
         if self.overflow:
            if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.cur_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                       self.cur_scale))
             return self.overflow
 
         norm_groups = []
diff --git a/deepspeed/runtime/zero/stage1.py b/deepspeed/runtime/zero/stage1.py
index d5c7616ff87e..7cd37f904faa 100755
--- a/deepspeed/runtime/zero/stage1.py
+++ b/deepspeed/runtime/zero/stage1.py
@@ -630,10 +630,10 @@ def step(self, closure=None):
         if self.overflow:
             self.zero_grad()
             if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.loss_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                       self.loss_scale))
             return self.overflow
 
         norm_groups = []
diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py
index 4cf39cb5ccd6..6f3fb1cd6509 100755
--- a/deepspeed/runtime/zero/stage2.py
+++ b/deepspeed/runtime/zero/stage2.py
@@ -1355,7 +1355,7 @@ def step(self, closure=None):
             see_memory_usage('After overflow after clearing gradients')
 
             logger.info(
-                "[deepscale] OVERFLOW! Rank {} Skipping step. Attempted loss scale: {}, "
+                "[deepspeed] fp16 dynamic loss scale overflow! Rank {} Skipping step. Attempted loss scale: {}, "
                 "reducing to {}".format(dist.get_rank(),
                                         prev_scale,
                                         self.loss_scale))
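For reference, below is a minimal sketch of the dynamic loss scaling pattern these log messages report on: when an fp16 gradient overflows, the optimizer step is skipped and the loss scale is reduced. The class name SimpleDynamicLossScaler and its attributes are illustrative assumptions, not the DeepSpeed LossScaler API.

import torch

class SimpleDynamicLossScaler:
    # Illustrative sketch only; not the DeepSpeed implementation.
    def __init__(self, init_scale=2.**16, scale_factor=2.0):
        self.cur_scale = init_scale
        self.scale_factor = scale_factor

    def has_overflow(self, params):
        # Overflow is detected when any gradient contains inf/NaN.
        return any(p.grad is not None and not torch.isfinite(p.grad).all()
                   for p in params)

    def update_scale(self, overflow):
        prev_scale = self.cur_scale
        if overflow:
            # Skip this step and retry with a smaller loss scale next iteration.
            self.cur_scale = max(self.cur_scale / self.scale_factor, 1.0)
            print("fp16 dynamic loss scale overflow! Skipping step. "
                  "Attempted loss scale: {}, reducing to {}".format(
                      prev_scale, self.cur_scale))
        else:
            # Simplified: real scalers grow only after many consecutive stable steps.
            self.cur_scale *= self.scale_factor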