From df4d846dc6a9759eb5a9ef616b8ce41436625f7f Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 4 May 2021 23:22:45 +0200
Subject: [PATCH 01/32] Refactor global step update

---
 pytorch_lightning/trainer/training_loop.py | 40 +++++-----------------
 1 file changed, 8 insertions(+), 32 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 8c510f08a83fc..67bfa8a551b57 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -512,28 +512,17 @@ def run_training_epoch(self):
             self.update_train_loop_lr_schedulers(monitor_metrics=monitor_metrics)
             self.trainer.checkpoint_connector.has_trained = True
 
-            # max steps reached, end training
-            if (
-                self.trainer.max_steps is not None and self.trainer.max_steps <= self.trainer.global_step + 1
-                and self._accumulated_batches_reached()
-            ):
-                break
-
-            # end epoch early
-            # stop when the flag is changed or we've gone past the amount
-            # requested in the batches
-            if self.trainer.should_stop:
-                break
-
             self.trainer.total_batch_idx += 1
 
-            # stop epoch if we limited the number of training batches
-            if self._num_training_batches_reached(is_last_batch):
-                break
-
             # progress global step according to grads progress
             self.increment_accumulated_grad_global_step()
 
+            max_steps_reached = (
+                self.trainer.max_steps is not None and self.trainer.max_steps <= self.trainer.global_step
+            )
+            if max_steps_reached or self.trainer.should_stop or self._num_training_batches_reached(is_last_batch):
+                break
+
         if batch_idx is None:
             # dataloader/iterator did not produce a batch
             return
@@ -555,15 +544,6 @@ def run_training_epoch(self):
         if should_train_only:
             self.check_checkpoint_callback(True)
 
-        if should_check_val:
-            self.trainer.validating = True
-            self.trainer.run_evaluation(on_epoch=True)
-            self.trainer.training = True
-
-        # increment the global step once
-        # progress global step according to grads progress
-        self.increment_accumulated_grad_global_step()
-
     def on_train_epoch_end(self, epoch_output: List[List[List[Result]]]) -> None:
         # inform logger the batch loop has finished
         self.trainer.logger_connector.on_train_epoch_end()
@@ -863,16 +843,12 @@ def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool, on_epoch: bo
         elif self.trainer.val_check_batch != float('inf'):
             is_val_check_batch = (batch_idx + 1) % self.trainer.val_check_batch == 0
 
-        # Note: num_training_batches is also inf for iterable datasets with no length defined
-        epoch_end_val_check = (batch_idx + 1) % self.trainer.num_training_batches == 0
         is_last_batch_for_infinite_dataset = is_last_batch and self.trainer.val_check_batch == float("inf")
 
         if on_epoch:
-            return (
-                is_val_check_batch and epoch_end_val_check
-            ) or self.trainer.should_stop or is_last_batch_for_infinite_dataset
+            return is_val_check_batch or self.trainer.should_stop or is_last_batch_for_infinite_dataset
         else:
-            return is_val_check_batch and not epoch_end_val_check
+            return is_val_check_batch
 
     def build_train_args(self, batch, batch_idx, opt_idx, hiddens):
         # enable not needing to add opt_idx to training_step

From 8e402a43f77b7c05cea0669c6450e236584a2020 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 6 May 2021 17:55:55 +0200
Subject: [PATCH 02/32] WIP

---
 pytorch_lightning/trainer/training_loop.py | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 67bfa8a551b57..3598e31abc3d0 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -458,10 +458,7 @@ def run_training_epoch(self):
 
         train_dataloader = self.trainer.data_connector.get_profiled_train_dataloader(train_dataloader)
         dataloader_idx = 0
-        val_loop_called = False
-
         batch_idx = None
-        is_last_batch = None
 
         for batch_idx, (batch, is_last_batch) in train_dataloader:
             self.trainer.batch_idx = batch_idx
@@ -500,7 +497,6 @@ def run_training_epoch(self):
                 self.trainer.validating = True
                 self.trainer.run_evaluation()
                 self.trainer.training = True
-                val_loop_called = True
 
             # -----------------------------------------
             # SAVE LOGGERS (ie: Tensorboard, etc...)
@@ -527,23 +523,24 @@ def run_training_epoch(self):
             # dataloader/iterator did not produce a batch
             return
 
+        #self.trainer.global_step -= 1
+
         # handle epoch_output on epoch end
         self.on_train_epoch_end(epoch_output)
 
         # log epoch metrics
         self.trainer.logger_connector.log_train_epoch_end_metrics(epoch_output)
 
-        should_check_val = self._should_check_val_fx(batch_idx, is_last_batch, on_epoch=True)
+        # TODO: make sure we don't update these again if already updated inside the batch loop
+        self.trainer.optimizer_connector.update_learning_rates(interval='epoch')
+
         should_skip_eval = self.trainer.evaluation_loop.should_skip_evaluation(self.trainer.num_val_batches)
         should_train_only = self.trainer.disable_validation or should_skip_eval
-
-        # update epoch level lr_schedulers if no val loop outside train loop is triggered
-        if (val_loop_called and not should_check_val) or should_train_only:
-            self.trainer.optimizer_connector.update_learning_rates(interval='epoch')
-
         if should_train_only:
             self.check_checkpoint_callback(True)
 
+        #self.trainer.global_step += 1
+
     def on_train_epoch_end(self, epoch_output: List[List[List[Result]]]) -> None:
         # inform logger the batch loop has finished
         self.trainer.logger_connector.on_train_epoch_end()

From 2404d58914e1bfada9eb8e274a9881963a5da5e5 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Mon, 24 May 2021 13:24:38 +0200
Subject: [PATCH 03/32] WIP

---
 pytorch_lightning/trainer/training_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 429a8ea2e678c..aee116e1c9c45 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -550,12 +550,12 @@ def run_training_epoch(self):
         # log epoch metrics
         self.trainer.logger_connector.log_train_epoch_end_metrics(epoch_output)
 
-        # TODO: make sure we don't update these again if already updated inside the batch loop
         self.trainer.optimizer_connector.update_learning_rates(interval='epoch')
 
         should_skip_eval = self.trainer.evaluation_loop.should_skip_evaluation(self.trainer.num_val_batches)
         should_train_only = self.trainer.disable_validation or should_skip_eval
         if should_train_only:
+            # if validation has run, this should have been called already
             self.check_checkpoint_callback(True)
 
         #self.trainer.global_step += 1
@@ -843,7 +843,7 @@ def backward(self, result, optimizer, opt_idx, *args, **kwargs):
             # track gradients
             result.grad_norm_dict = self.track_and_norm_grad(optimizer=optimizer)
 
-    def update_train_loop_lr_schedulers(self, monitor_metrics=None):
+    def update_train_loop_lr_schedulers(self, interval: str, monitor_metrics: Dict[str, _METRIC] = None) -> None:
         num_accumulated_batches_reached = self._accumulated_batches_reached()
         num_training_batches_reached = self._num_training_batches_reached()
 

From 3e5e0876f905d8616cd1156124a504a9f1fc9a2f Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Mon, 24 May 2021 13:47:27 +0200
Subject: [PATCH 04/32] WIP

---
 .../trainer/connectors/optimizer_connector.py | 21 ++++++--------
 pytorch_lightning/trainer/trainer.py          | 13 +--------
 pytorch_lightning/trainer/training_loop.py    | 29 +++++++++----------
 3 files changed, 24 insertions(+), 39 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/optimizer_connector.py b/pytorch_lightning/trainer/connectors/optimizer_connector.py
index e7fbdf9b18c02..2797504288bd3 100644
--- a/pytorch_lightning/trainer/connectors/optimizer_connector.py
+++ b/pytorch_lightning/trainer/connectors/optimizer_connector.py
@@ -11,30 +11,30 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Dict, List, Optional
+from typing import List, Optional
+from weakref import proxy
 
+import pytorch_lightning as pl
 from pytorch_lightning.utilities import rank_zero_warn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 
 class OptimizerConnector:
 
-    def __init__(self, trainer):
-        self.trainer = trainer
+    def __init__(self, trainer: 'pl.Trainer') -> None:
+        self.trainer = proxy(trainer)
 
-    def on_trainer_init(self):
+    def on_trainer_init(self) -> None:
         self.trainer.lr_schedulers = []
         self.trainer.optimizers = []
         self.trainer.optimizer_frequencies = []
 
-    def update_learning_rates(
-        self, interval: str, monitor_metrics: Optional[Dict[str, Any]] = None, opt_indices: Optional[List[int]] = None
-    ):
+    def update_learning_rates(self, interval: str, opt_indices: Optional[List[int]] = None) -> None:
         """Update learning rates.
 
         Args:
             interval: either 'epoch' or 'step'.
-            monitor_metrics: dict of possible values to monitor
+            opt_indices: indices of the optimizers to update.
         """
         if not self.trainer.lr_schedulers or not self.trainer.lightning_module.automatic_optimization:
             return
@@ -55,10 +55,7 @@ def update_learning_rates(
                 monitor_key, monitor_val = None, None
                 if lr_scheduler['reduce_on_plateau']:
                     monitor_key = lr_scheduler['monitor']
-                    monitor_val = (
-                        monitor_metrics.get(monitor_key) if monitor_metrics is not None else
-                        self.trainer.logger_connector.callback_metrics.get(monitor_key)
-                    )
+                    monitor_val = self.trainer.logger_connector.callback_metrics.get(monitor_key)
                     if monitor_val is None:
                         if lr_scheduler.get('strict', True):
                             avail_metrics = list(self.trainer.logger_connector.callback_metrics.keys())
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 6a20625978e39..db7f99add85e9 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -923,7 +923,7 @@ def _run_train(self) -> None:
             self.state.stage = None
             raise
 
-    def _run_evaluation(self, on_epoch: bool = False) -> _EVALUATE_OUTPUT:
+    def _run_evaluation(self) -> _EVALUATE_OUTPUT:
         if not (self.evaluating or self.sanity_checking):
             rank_zero_warn(
                 f"`trainer._run_evaluation()` was called but the running stage is set to {self.state.stage}."
@@ -1005,17 +1005,6 @@ def _run_evaluation(self, on_epoch: bool = False) -> _EVALUATE_OUTPUT:
         # hook
         self.evaluation_loop.on_evaluation_epoch_end()
 
-        # update epoch-level lr_schedulers
-        if on_epoch:
-            self.optimizer_connector.update_learning_rates(
-                interval='epoch',
-                opt_indices=[
-                    opt_idx for opt_idx, _ in self.train_loop.get_active_optimizers(
-                        batch_idx=(self.train_loop.total_batch_idx - 1)
-                    )  # Select the optimizers which were used in the last batch of the epoch
-                ],
-            )
-
         # log epoch metrics
         eval_loop_results = self.logger_connector.get_evaluate_epoch_results()
 
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index aee116e1c9c45..a60ab7a5ccc32 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -34,6 +34,7 @@
 from pytorch_lightning.utilities.model_helpers import is_overridden
 from pytorch_lightning.utilities.parsing import AttributeDict
 from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature
+from pytorch_lightning.utilities.types import _METRIC
 from pytorch_lightning.utilities.warnings import WarningCache
 
 
@@ -523,11 +524,10 @@ def run_training_epoch(self):
             self.save_loggers_on_train_batch_end()
 
             # update LR schedulers
-            monitor_metrics = deepcopy(self.trainer.logger_connector.callback_metrics)
-            self.update_train_loop_lr_schedulers(monitor_metrics=monitor_metrics)
+            self.update_lr_schedulers('step')
             self.trainer.checkpoint_connector.has_trained = True
 
-            self.trainer.total_batch_idx += 1
+            self.total_batch_idx += 1
 
             # progress global step according to grads progress
             self.increment_accumulated_grad_global_step()
@@ -550,7 +550,7 @@ def run_training_epoch(self):
         # log epoch metrics
         self.trainer.logger_connector.log_train_epoch_end_metrics(epoch_output)
 
-        self.trainer.optimizer_connector.update_learning_rates(interval='epoch')
+        self.update_lr_schedulers('epoch')
 
         should_skip_eval = self.trainer.evaluation_loop.should_skip_evaluation(self.trainer.num_val_batches)
         should_train_only = self.trainer.disable_validation or should_skip_eval
@@ -843,17 +843,16 @@ def backward(self, result, optimizer, opt_idx, *args, **kwargs):
             # track gradients
             result.grad_norm_dict = self.track_and_norm_grad(optimizer=optimizer)
 
-    def update_train_loop_lr_schedulers(self, interval: str, monitor_metrics: Dict[str, _METRIC] = None) -> None:
-        num_accumulated_batches_reached = self._accumulated_batches_reached()
-        num_training_batches_reached = self._num_training_batches_reached()
-
-        if num_accumulated_batches_reached or num_training_batches_reached:
-            # update lr
-            self.trainer.optimizer_connector.update_learning_rates(
-                interval="step",
-                monitor_metrics=monitor_metrics,
-                opt_indices=[opt_idx for opt_idx, _ in self.get_active_optimizers()],
-            )
+    def update_lr_schedulers(self, interval: str) -> None:
+        if interval == "step":
+            finished_accumulation = self._accumulated_batches_reached()
+            finished_epoch = self._num_training_batches_reached()
+            if not finished_accumulation and not finished_epoch:
+                return
+        self.trainer.optimizer_connector.update_learning_rates(
+            interval=interval,
+            opt_indices=[opt_idx for opt_idx, _ in self.get_active_optimizers()],
+        )
 
     def increment_accumulated_grad_global_step(self):
         num_accumulated_batches_reached = self._accumulated_batches_reached()

From d274165c56cffb6ed7c69179116a9f194ba833ba Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Mon, 24 May 2021 14:34:23 +0200
Subject: [PATCH 05/32] Fix tests

---
 pytorch_lightning/trainer/training_loop.py |  10 +-
 tests/models/test_hooks.py                 |  33 +++++-
 tests/trainer/loops/test_training_loop.py  | 112 ---------------------
 3 files changed, 36 insertions(+), 119 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index a60ab7a5ccc32..7707e40d5a611 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -532,9 +532,7 @@ def run_training_epoch(self):
             # progress global step according to grads progress
             self.increment_accumulated_grad_global_step()
 
-            max_steps_reached = (
-                self.trainer.max_steps is not None and self.trainer.max_steps <= self.trainer.global_step
-            )
+            max_steps_reached = (self.max_steps is not None and self.max_steps <= self.global_step)
             if max_steps_reached or self.trainer.should_stop or self._num_training_batches_reached(is_last_batch):
                 break
 
@@ -542,7 +540,8 @@ def run_training_epoch(self):
             # dataloader/iterator did not produce a batch
             return
 
-        #self.trainer.global_step -= 1
+        # TODO
+        # self.global_step -= 1
 
         # handle epoch_output on epoch end
         self.on_train_epoch_end(epoch_output)
@@ -558,7 +557,8 @@ def run_training_epoch(self):
             # if validation has run, this should have been called already
             self.check_checkpoint_callback(True)
 
-        #self.trainer.global_step += 1
+        # TODO
+        # self.global_step += 1
 
     def on_train_epoch_end(self, epoch_output: List[List[List[Result]]]) -> None:
         # inform logger the batch loop has finished
diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 78f8d2c0a94e9..789afad2909cc 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -258,6 +258,26 @@ def __init__(self):
             super().__init__()
             self.called = []
 
+        def training_step(self, *args, **kwargs):
+            self.called.append("training_step")
+            return super().training_step(*args, **kwargs)
+
+        def optimizer_zero_grad(self, *args, **kwargs):
+            self.called.append("optimizer_zero_grad")
+            super().optimizer_zero_grad(*args, **kwargs)
+
+        def backward(self, *args, **kwargs):
+            self.called.append("backward")
+            super().backward(*args, **kwargs)
+
+        def optimizer_step(self, *args, **kwargs):
+            super().optimizer_step(*args, **kwargs)
+            self.called.append("optimizer_step")  # append after as closure calls other methods
+
+        def training_epoch_end(self, *args, **kwargs):
+            self.called.append("training_epoch_end")
+            super().training_epoch_end(*args, **kwargs)
+
         def on_after_backward(self):
             self.called.append("on_after_backward")
             super().on_after_backward()
@@ -455,18 +475,24 @@ def teardown(self, stage=None):
         'on_before_batch_transfer',
         'transfer_batch_to_device',
         'on_after_batch_transfer',
+        'training_step',
         'on_before_zero_grad',
+        'optimizer_zero_grad',
+        'backward',
         'on_after_backward',
+        'optimizer_step',
         'on_train_batch_end',
         'on_train_batch_start',
         'on_before_batch_transfer',
         'transfer_batch_to_device',
         'on_after_batch_transfer',
+        'training_step',
         'on_before_zero_grad',
+        'optimizer_zero_grad',
+        'backward',
         'on_after_backward',
+        'optimizer_step',
         'on_train_batch_end',
-        'on_train_epoch_end',
-        'on_epoch_end',
         'on_validation_model_eval',
         'on_validation_start',
         'on_epoch_start',
@@ -481,6 +507,9 @@ def teardown(self, stage=None):
         'on_save_checkpoint',
         'on_validation_end',
         'on_validation_model_train',
+        'training_epoch_end',
+        'on_train_epoch_end',
+        'on_epoch_end',
         'on_train_end',
         'on_fit_end',
         'teardown_fit',
diff --git a/tests/trainer/loops/test_training_loop.py b/tests/trainer/loops/test_training_loop.py
index 2d32d8c8878e4..89a2ae0ec0d61 100644
--- a/tests/trainer/loops/test_training_loop.py
+++ b/tests/trainer/loops/test_training_loop.py
@@ -17,118 +17,6 @@
 from tests.helpers import BoringModel
 
 
-def test_training_loop_hook_call_order(tmpdir):
-    """Tests that hooks / methods called in the training loop are in the correct order as detailed in the docs:
-    https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#hooks"""
-
-    class HookedModel(BoringModel):
-
-        def __init__(self):
-            super().__init__()
-            self.called = []
-
-        def on_epoch_start(self):
-            self.called.append("on_epoch_start")
-            super().on_epoch_start()
-
-        def on_train_epoch_start(self):
-            self.called.append("on_train_epoch_start")
-            super().on_train_epoch_start()
-
-        def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
-            self.called.append("on_train_batch_start")
-            super().on_train_batch_start(batch, batch_idx, dataloader_idx)
-
-        def training_step(self, batch, batch_idx):
-            self.called.append("training_step")
-            return super().training_step(batch, batch_idx)
-
-        def on_before_zero_grad(self, optimizer):
-            self.called.append("on_before_zero_grad")
-            super().on_before_zero_grad(optimizer)
-
-        def optimizer_zero_grad(self, epoch, batch_idx, optimizer, optimizer_idx):
-            self.called.append("optimizer_zero_grad")
-            super().optimizer_zero_grad(epoch, batch_idx, optimizer, optimizer_idx)
-
-        def backward(self, loss, optimizer, optimizer_idx, *args, **kwargs):
-            self.called.append("backward")
-            super().backward(loss, optimizer, optimizer_idx, *args, **kwargs)
-
-        def on_after_backward(self):
-            self.called.append("on_after_backward")
-            super().on_after_backward()
-
-        def optimizer_step(
-            self,
-            epoch,
-            batch_idx,
-            optimizer,
-            optimizer_idx,
-            optimizer_closure,
-            on_tpu,
-            using_native_amp,
-            using_lbfgs,
-        ):
-            super().optimizer_step(
-                epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs
-            )
-            self.called.append("optimizer_step")  # append after as closure calls other methods
-
-        def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx):
-            self.called.append("on_train_batch_end")
-            super().on_train_batch_end(outputs, batch, batch_idx, dataloader_idx)
-
-        def training_epoch_end(self, outputs):
-            self.called.append("training_epoch_end")
-            super().training_epoch_end(outputs)
-
-        def on_train_epoch_end(self, outputs):
-            self.called.append("on_train_epoch_end")
-            super().on_train_epoch_end(outputs)
-
-        def on_epoch_end(self):
-            self.called.append("on_epoch_end")
-            super().on_epoch_end()
-
-    model = HookedModel()
-
-    # fit model
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=1,
-        limit_val_batches=1,
-        limit_train_batches=1,
-        limit_test_batches=1,
-        progress_bar_refresh_rate=0,
-        weights_summary=None,
-    )
-
-    assert model.called == []
-
-    trainer.fit(model)
-    expected = [
-        "on_epoch_start",  # validation
-        "on_epoch_end",
-        "on_epoch_start",  # training
-        "on_train_epoch_start",
-        "on_train_batch_start",
-        "training_step",
-        "on_before_zero_grad",
-        "optimizer_zero_grad",
-        "backward",
-        "on_after_backward",
-        "optimizer_step",
-        "on_train_batch_end",
-        "training_epoch_end",
-        "on_train_epoch_end",
-        "on_epoch_end",
-        "on_epoch_start",  # validation
-        "on_epoch_end",
-    ]
-    assert model.called == expected
-
-
 def test_outputs_format(tmpdir):
     """Tests that outputs objects passed to model hooks and methods are consistent and in the correct format."""
 

From 517970a7328128c74641e42158ed1716474728c2 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Mon, 24 May 2021 15:52:20 +0200
Subject: [PATCH 06/32] Fix tests

---
 tests/callbacks/test_callbacks.py |   4 +-
 tests/models/test_hooks.py        | 494 +++++++++++++++++-------------
 2 files changed, 283 insertions(+), 215 deletions(-)

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index 9b048e022c45b..a22e72ce09184 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -83,8 +83,6 @@ def test_trainer_callback_hook_system_fit(_, tmpdir):
         call.on_after_backward(trainer, model),
         call.on_train_batch_end(trainer, model, ANY, ANY, 2, 0),
         call.on_batch_end(trainer, model),
-        call.on_train_epoch_end(trainer, model, ANY),
-        call.on_epoch_end(trainer, model),
         call.on_validation_start(trainer, model),
         call.on_epoch_start(trainer, model),
         call.on_validation_epoch_start(trainer, model),
@@ -94,6 +92,8 @@ def test_trainer_callback_hook_system_fit(_, tmpdir):
         call.on_epoch_end(trainer, model),
         call.on_validation_end(trainer, model),
         call.on_save_checkpoint(trainer, model),  # should take ANY but we are inspecting signature for BC
+        call.on_train_epoch_end(trainer, model, ANY),
+        call.on_epoch_end(trainer, model),
         call.on_train_end(trainer, model),
         call.on_fit_end(trainer, model),
         call.teardown(trainer, model, 'fit'),
diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 789afad2909cc..f549b5d2d8ecc 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -249,206 +249,227 @@ def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
         assert trainer.global_step == (batch_idx_ + 1) * max_epochs
 
 
-def test_trainer_model_hook_system(tmpdir):
-    """Test the LightningModule hook system."""
-
-    class HookedModel(BoringModel):
-
-        def __init__(self):
-            super().__init__()
-            self.called = []
-
-        def training_step(self, *args, **kwargs):
-            self.called.append("training_step")
-            return super().training_step(*args, **kwargs)
-
-        def optimizer_zero_grad(self, *args, **kwargs):
-            self.called.append("optimizer_zero_grad")
-            super().optimizer_zero_grad(*args, **kwargs)
-
-        def backward(self, *args, **kwargs):
-            self.called.append("backward")
-            super().backward(*args, **kwargs)
-
-        def optimizer_step(self, *args, **kwargs):
-            super().optimizer_step(*args, **kwargs)
-            self.called.append("optimizer_step")  # append after as closure calls other methods
-
-        def training_epoch_end(self, *args, **kwargs):
-            self.called.append("training_epoch_end")
-            super().training_epoch_end(*args, **kwargs)
-
-        def on_after_backward(self):
-            self.called.append("on_after_backward")
-            super().on_after_backward()
-
-        def on_before_zero_grad(self, *args, **kwargs):
-            self.called.append("on_before_zero_grad")
-            super().on_before_zero_grad(*args, **kwargs)
-
-        def on_epoch_start(self):
-            self.called.append("on_epoch_start")
-            super().on_epoch_start()
-
-        def on_epoch_end(self):
-            self.called.append("on_epoch_end")
-            super().on_epoch_end()
-
-        def on_fit_start(self):
-            self.called.append("on_fit_start")
-            super().on_fit_start()
-
-        def on_fit_end(self):
-            self.called.append("on_fit_end")
-            super().on_fit_end()
+class HookedModel(BoringModel):
+
+    def __init__(self):
+        super().__init__()
+        self.called = []
+        self.train_batch = [
+            'on_train_batch_start',
+            'on_before_batch_transfer',
+            'transfer_batch_to_device',
+            'on_after_batch_transfer',
+            'training_step',
+            'on_before_zero_grad',
+            'optimizer_zero_grad',
+            'backward',
+            'on_after_backward',
+            'optimizer_step',
+            'on_train_batch_end',
+        ]
+        self.val_batch = [
+            'on_validation_batch_start',
+            'on_before_batch_transfer',
+            'transfer_batch_to_device',
+            'on_after_batch_transfer',
+            'on_validation_batch_end',
+        ]
+
+    def training_step(self, *args, **kwargs):
+        self.called.append("training_step")
+        return super().training_step(*args, **kwargs)
+
+    def on_before_zero_grad(self, *args, **kwargs):
+        self.called.append("on_before_zero_grad")
+        super().on_before_zero_grad(*args, **kwargs)
+
+    def optimizer_zero_grad(self, *args, **kwargs):
+        self.called.append("optimizer_zero_grad")
+        super().optimizer_zero_grad(*args, **kwargs)
+
+    def training_epoch_end(self, *args, **kwargs):
+        self.called.append("training_epoch_end")
+        super().training_epoch_end(*args, **kwargs)
+
+    def backward(self, *args, **kwargs):
+        self.called.append("backward")
+        super().backward(*args, **kwargs)
+
+    def on_after_backward(self):
+        self.called.append("on_after_backward")
+        super().on_after_backward()
+
+    def optimizer_step(self, *args, **kwargs):
+        super().optimizer_step(*args, **kwargs)
+        self.called.append("optimizer_step")  # append after as closure calls other methods
+
+    def validation_epoch_end(self, *args, **kwargs):
+        self.called.append("validation_epoch_end")
+        super().validation_epoch_end(*args, **kwargs)
+
+    def on_epoch_start(self):
+        self.called.append("on_epoch_start")
+        super().on_epoch_start()
+
+    def on_epoch_end(self):
+        self.called.append("on_epoch_end")
+        super().on_epoch_end()
+
+    def on_fit_start(self):
+        self.called.append("on_fit_start")
+        super().on_fit_start()
+
+    def on_fit_end(self):
+        self.called.append("on_fit_end")
+        super().on_fit_end()
+
+    def on_hpc_load(self, *args, **kwargs):
+        self.called.append("on_hpc_load")
+        super().on_hpc_load(*args, **kwargs)
+
+    def on_hpc_save(self, *args, **kwargs):
+        self.called.append("on_hpc_save")
+        super().on_hpc_save(*args, **kwargs)
+
+    def on_load_checkpoint(self, *args, **kwargs):
+        self.called.append("on_load_checkpoint")
+        super().on_load_checkpoint(*args, **kwargs)
+
+    def on_save_checkpoint(self, *args, **kwargs):
+        self.called.append("on_save_checkpoint")
+        super().on_save_checkpoint(*args, **kwargs)
+
+    def on_pretrain_routine_start(self):
+        self.called.append("on_pretrain_routine_start")
+        super().on_pretrain_routine_start()
+
+    def on_pretrain_routine_end(self):
+        self.called.append("on_pretrain_routine_end")
+        super().on_pretrain_routine_end()
+
+    def on_train_start(self):
+        self.called.append("on_train_start")
+        super().on_train_start()
+
+    def on_train_end(self):
+        self.called.append("on_train_end")
+        super().on_train_end()
+
+    def on_before_batch_transfer(self, *args, **kwargs):
+        self.called.append("on_before_batch_transfer")
+        return super().on_before_batch_transfer(*args, **kwargs)
+
+    def transfer_batch_to_device(self, *args, **kwargs):
+        self.called.append("transfer_batch_to_device")
+        return super().transfer_batch_to_device(*args, **kwargs)
+
+    def on_after_batch_transfer(self, *args, **kwargs):
+        self.called.append("on_after_batch_transfer")
+        return super().on_after_batch_transfer(*args, **kwargs)
 
-        def on_hpc_load(self, *args, **kwargs):
-            self.called.append("on_hpc_load")
-            super().on_hpc_load(*args, **kwargs)
-
-        def on_hpc_save(self, *args, **kwargs):
-            self.called.append("on_hpc_save")
-            super().on_hpc_save(*args, **kwargs)
-
-        def on_load_checkpoint(self, *args, **kwargs):
-            self.called.append("on_load_checkpoint")
-            super().on_load_checkpoint(*args, **kwargs)
-
-        def on_save_checkpoint(self, *args, **kwargs):
-            self.called.append("on_save_checkpoint")
-            super().on_save_checkpoint(*args, **kwargs)
-
-        def on_pretrain_routine_start(self):
-            self.called.append("on_pretrain_routine_start")
-            super().on_pretrain_routine_start()
-
-        def on_pretrain_routine_end(self):
-            self.called.append("on_pretrain_routine_end")
-            super().on_pretrain_routine_end()
-
-        def on_train_start(self):
-            self.called.append("on_train_start")
-            super().on_train_start()
-
-        def on_train_end(self):
-            self.called.append("on_train_end")
-            super().on_train_end()
-
-        def on_before_batch_transfer(self, *args, **kwargs):
-            self.called.append("on_before_batch_transfer")
-            return super().on_before_batch_transfer(*args, **kwargs)
-
-        def transfer_batch_to_device(self, *args, **kwargs):
-            self.called.append("transfer_batch_to_device")
-            return super().transfer_batch_to_device(*args, **kwargs)
-
-        def on_after_batch_transfer(self, *args, **kwargs):
-            self.called.append("on_after_batch_transfer")
-            return super().on_after_batch_transfer(*args, **kwargs)
+    def on_train_batch_start(self, *args, **kwargs):
+        self.called.append("on_train_batch_start")
+        super().on_train_batch_start(*args, **kwargs)
 
-        def on_train_batch_start(self, *args, **kwargs):
-            self.called.append("on_train_batch_start")
-            super().on_train_batch_start(*args, **kwargs)
+    def on_train_batch_end(self, *args, **kwargs):
+        self.called.append("on_train_batch_end")
+        super().on_train_batch_end(*args, **kwargs)
 
-        def on_train_batch_end(self, *args, **kwargs):
-            self.called.append("on_train_batch_end")
-            super().on_train_batch_end(*args, **kwargs)
+    def on_train_epoch_start(self):
+        self.called.append("on_train_epoch_start")
+        super().on_train_epoch_start()
 
-        def on_train_epoch_start(self):
-            self.called.append("on_train_epoch_start")
-            super().on_train_epoch_start()
+    def on_train_epoch_end(self):
+        self.called.append("on_train_epoch_end")
+        super().on_train_epoch_end()
 
-        def on_train_epoch_end(self):
-            self.called.append("on_train_epoch_end")
-            super().on_train_epoch_end()
+    def on_validation_start(self):
+        self.called.append("on_validation_start")
+        super().on_validation_start()
 
-        def on_validation_start(self):
-            self.called.append("on_validation_start")
-            super().on_validation_start()
+    def on_validation_end(self):
+        self.called.append("on_validation_end")
+        super().on_validation_end()
 
-        def on_validation_end(self):
-            self.called.append("on_validation_end")
-            super().on_validation_end()
+    def on_validation_batch_start(self, *args, **kwargs):
+        self.called.append("on_validation_batch_start")
+        super().on_validation_batch_start(*args, **kwargs)
 
-        def on_validation_batch_start(self, *args, **kwargs):
-            self.called.append("on_validation_batch_start")
-            super().on_validation_batch_start(*args, **kwargs)
+    def on_validation_batch_end(self, *args, **kwargs):
+        self.called.append("on_validation_batch_end")
+        super().on_validation_batch_end(*args, **kwargs)
 
-        def on_validation_batch_end(self, *args, **kwargs):
-            self.called.append("on_validation_batch_end")
-            super().on_validation_batch_end(*args, **kwargs)
+    def on_validation_epoch_start(self):
+        self.called.append("on_validation_epoch_start")
+        super().on_validation_epoch_start()
 
-        def on_validation_epoch_start(self):
-            self.called.append("on_validation_epoch_start")
-            super().on_validation_epoch_start()
+    def on_validation_epoch_end(self, *args, **kwargs):
+        self.called.append("on_validation_epoch_end")
+        super().on_validation_epoch_end(*args, **kwargs)
 
-        def on_validation_epoch_end(self, *args, **kwargs):
-            self.called.append("on_validation_epoch_end")
-            super().on_validation_epoch_end(*args, **kwargs)
+    def on_test_start(self):
+        self.called.append("on_test_start")
+        super().on_test_start()
 
-        def on_test_start(self):
-            self.called.append("on_test_start")
-            super().on_test_start()
+    def on_test_batch_start(self, *args, **kwargs):
+        self.called.append("on_test_batch_start")
+        super().on_test_batch_start(*args, **kwargs)
 
-        def on_test_batch_start(self, *args, **kwargs):
-            self.called.append("on_test_batch_start")
-            super().on_test_batch_start(*args, **kwargs)
+    def on_test_batch_end(self, *args, **kwargs):
+        self.called.append("on_test_batch_end")
+        super().on_test_batch_end(*args, **kwargs)
 
-        def on_test_batch_end(self, *args, **kwargs):
-            self.called.append("on_test_batch_end")
-            super().on_test_batch_end(*args, **kwargs)
+    def on_test_epoch_start(self):
+        self.called.append("on_test_epoch_start")
+        super().on_test_epoch_start()
 
-        def on_test_epoch_start(self):
-            self.called.append("on_test_epoch_start")
-            super().on_test_epoch_start()
+    def on_test_epoch_end(self, *args, **kwargs):
+        self.called.append("on_test_epoch_end")
+        super().on_test_epoch_end(*args, **kwargs)
 
-        def on_test_epoch_end(self, *args, **kwargs):
-            self.called.append("on_test_epoch_end")
-            super().on_test_epoch_end(*args, **kwargs)
+    def on_validation_model_eval(self):
+        self.called.append("on_validation_model_eval")
+        super().on_validation_model_eval()
 
-        def on_validation_model_eval(self):
-            self.called.append("on_validation_model_eval")
-            super().on_validation_model_eval()
+    def on_validation_model_train(self):
+        self.called.append("on_validation_model_train")
+        super().on_validation_model_train()
 
-        def on_validation_model_train(self):
-            self.called.append("on_validation_model_train")
-            super().on_validation_model_train()
+    def on_test_model_eval(self):
+        self.called.append("on_test_model_eval")
+        super().on_test_model_eval()
 
-        def on_test_model_eval(self):
-            self.called.append("on_test_model_eval")
-            super().on_test_model_eval()
+    def on_test_model_train(self):
+        self.called.append("on_test_model_train")
+        super().on_test_model_train()
 
-        def on_test_model_train(self):
-            self.called.append("on_test_model_train")
-            super().on_test_model_train()
+    def on_test_end(self):
+        self.called.append("on_test_end")
+        super().on_test_end()
 
-        def on_test_end(self):
-            self.called.append("on_test_end")
-            super().on_test_end()
+    def setup(self, stage=None):
+        self.called.append(f"setup_{stage}")
+        super().setup(stage=stage)
 
-        def setup(self, stage=None):
-            self.called.append(f"setup_{stage}")
-            super().setup(stage=stage)
+    def teardown(self, stage=None):
+        self.called.append(f"teardown_{stage}")
+        super().teardown(stage)
 
-        def teardown(self, stage=None):
-            self.called.append(f"teardown_{stage}")
-            super().teardown(stage)
 
+def test_trainer_model_hook_system_fit(tmpdir):
+    """Test the LightningModule hook system."""
     model = HookedModel()
-
-    # fit model
+    train_batches = 2
+    val_batches = 2
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_epochs=1,
-        limit_val_batches=1,
-        limit_train_batches=2,
-        limit_test_batches=1,
+        limit_train_batches=train_batches,
+        limit_val_batches=val_batches,
         progress_bar_refresh_rate=0,
         weights_summary=None,
     )
-
     assert model.called == []
-
     trainer.fit(model)
     expected = [
         'setup_fit',
@@ -459,11 +480,8 @@ def teardown(self, stage=None):
         'on_validation_start',
         'on_epoch_start',
         'on_validation_epoch_start',
-        'on_validation_batch_start',
-        'on_before_batch_transfer',
-        'transfer_batch_to_device',
-        'on_after_batch_transfer',
-        'on_validation_batch_end',
+        *(model.val_batch * val_batches),
+        'validation_epoch_end',
         'on_validation_epoch_end',
         'on_epoch_end',
         'on_validation_end',
@@ -471,37 +489,13 @@ def teardown(self, stage=None):
         'on_train_start',
         'on_epoch_start',
         'on_train_epoch_start',
-        'on_train_batch_start',
-        'on_before_batch_transfer',
-        'transfer_batch_to_device',
-        'on_after_batch_transfer',
-        'training_step',
-        'on_before_zero_grad',
-        'optimizer_zero_grad',
-        'backward',
-        'on_after_backward',
-        'optimizer_step',
-        'on_train_batch_end',
-        'on_train_batch_start',
-        'on_before_batch_transfer',
-        'transfer_batch_to_device',
-        'on_after_batch_transfer',
-        'training_step',
-        'on_before_zero_grad',
-        'optimizer_zero_grad',
-        'backward',
-        'on_after_backward',
-        'optimizer_step',
-        'on_train_batch_end',
+        *(model.train_batch * train_batches),
         'on_validation_model_eval',
         'on_validation_start',
         'on_epoch_start',
         'on_validation_epoch_start',
-        'on_validation_batch_start',
-        'on_before_batch_transfer',
-        'transfer_batch_to_device',
-        'on_after_batch_transfer',
-        'on_validation_batch_end',
+        *(model.val_batch * val_batches),
+        'validation_epoch_end',
         'on_validation_epoch_end',
         'on_epoch_end',
         'on_save_checkpoint',
@@ -516,8 +510,52 @@ def teardown(self, stage=None):
     ]
     assert model.called == expected
 
+
+def test_trainer_model_hook_system_fit_no_val(tmpdir):
     model = HookedModel()
+    train_batches = 2
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        limit_val_batches=0,
+        limit_train_batches=train_batches,
+        progress_bar_refresh_rate=0,
+        weights_summary=None,
+    )
+    assert model.called == []
 
+    trainer.fit(model)
+    expected = [
+        'setup_fit',
+        'on_fit_start',
+        'on_pretrain_routine_start',
+        'on_pretrain_routine_end',
+        'on_train_start',
+        'on_epoch_start',
+        'on_train_epoch_start',
+        *(model.train_batch * train_batches),
+        'training_epoch_end',
+        'on_train_epoch_end',
+        'on_epoch_end',
+        'on_save_checkpoint',  # from train epoch end
+        'on_save_checkpoint',  # from on train end
+        'on_train_end',
+        'on_fit_end',
+        'teardown_fit',
+    ]
+    assert model.called == expected
+
+
+def test_trainer_model_hook_system_validate(tmpdir):
+    model = HookedModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        limit_val_batches=1,
+        progress_bar_refresh_rate=0,
+        weights_summary=None,
+    )
+    assert model.called == []
     trainer.validate(model, verbose=False)
     expected = [
         'setup_validate',
@@ -530,6 +568,7 @@ def teardown(self, stage=None):
         'transfer_batch_to_device',
         'on_after_batch_transfer',
         'on_validation_batch_end',
+        'validation_epoch_end',
         'on_validation_epoch_end',
         'on_epoch_end',
         'on_validation_end',
@@ -538,9 +577,18 @@ def teardown(self, stage=None):
     ]
     assert model.called == expected
 
+
+def test_trainer_model_hook_system_test(tmpdir):
     model = HookedModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        limit_test_batches=1,
+        progress_bar_refresh_rate=0,
+        weights_summary=None,
+    )
+    assert model.called == []
     trainer.test(model, verbose=False)
-
     expected = [
         'setup_test',
         'on_test_model_eval',
@@ -676,30 +724,50 @@ def on_after_batch_transfer(self, *args, **kwargs):
         reload_dataloaders_every_epoch=True,
     )
     trainer.fit(model, datamodule=dm)
-
     expected = [
-        'prepare_data', 'setup_fit', 'val_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device',
-        'on_after_batch_transfer', 'train_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device',
-        'on_after_batch_transfer', 'on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer',
-        'val_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer',
-        'teardown_fit'
+        'prepare_data',
+        'setup_fit',
+        'val_dataloader',
+        'on_before_batch_transfer',
+        'transfer_batch_to_device',
+        'on_after_batch_transfer',
+        'train_dataloader',
+        'on_before_batch_transfer',
+        'transfer_batch_to_device',
+        'on_after_batch_transfer',
+        'on_before_batch_transfer',
+        'transfer_batch_to_device',
+        'on_after_batch_transfer',
+        'val_dataloader',
+        'on_before_batch_transfer',
+        'transfer_batch_to_device',
+        'on_after_batch_transfer',
+        'teardown_fit',
     ]
     assert dm.called == expected
 
     dm = HookedDataModule()
     trainer.validate(model, datamodule=dm, verbose=False)
-
     expected = [
-        'prepare_data', 'setup_validate', 'val_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device',
-        'on_after_batch_transfer', 'teardown_validate'
+        'prepare_data',
+        'setup_validate',
+        'val_dataloader',
+        'on_before_batch_transfer',
+        'transfer_batch_to_device',
+        'on_after_batch_transfer',
+        'teardown_validate',
     ]
     assert dm.called == expected
 
     dm = HookedDataModule()
     trainer.test(model, datamodule=dm, verbose=False)
-
     expected = [
-        'prepare_data', 'setup_test', 'test_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device',
-        'on_after_batch_transfer', 'teardown_test'
+        'prepare_data',
+        'setup_test',
+        'test_dataloader',
+        'on_before_batch_transfer',
+        'transfer_batch_to_device',
+        'on_after_batch_transfer',
+        'teardown_test',
     ]
     assert dm.called == expected

From c8500d7a2388e57cdf153bd8f67f4ad69a97caad Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Mon, 24 May 2021 17:35:54 +0200
Subject: [PATCH 07/32] Fix tests

---
 pytorch_lightning/trainer/training_loop.py | 29 +++++++++++-----------
 tests/models/test_hooks.py                 |  5 ++--
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 7707e40d5a611..d9148be8f551e 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -14,7 +14,7 @@
 
 from collections import OrderedDict
 from contextlib import contextmanager, suppress
-from copy import copy, deepcopy
+from copy import copy
 from functools import partial, update_wrapper
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
@@ -34,7 +34,6 @@
 from pytorch_lightning.utilities.model_helpers import is_overridden
 from pytorch_lightning.utilities.parsing import AttributeDict
 from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature
-from pytorch_lightning.utilities.types import _METRIC
 from pytorch_lightning.utilities.warnings import WarningCache
 
 
@@ -874,30 +873,30 @@ def should_accumulate(self):
         is_final_batch = self._num_training_batches_reached()
         return not (accumulation_done or is_final_batch)
 
-    def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool, on_epoch: bool = False) -> bool:
+    def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool) -> bool:
         """ Decide if we should run validation. """
-
         if not self.trainer.enable_validation:
             return False
 
-        # check if this epoch is eligible to run validation
-        if (self.trainer.current_epoch + 1) % self.trainer.check_val_every_n_epoch != 0:
+        is_val_check_epoch = (self.trainer.current_epoch + 1) % self.trainer.check_val_every_n_epoch != 1
+        if not is_val_check_epoch:
             return False
 
+        is_infinite_dataset = self.trainer.val_check_batch == float('inf')
+        if is_last_batch and is_infinite_dataset:
+            return True
+
+        if self.trainer.should_stop:
+            return True
+
         # val_check_batch is inf for iterable datasets with no length defined
         # TODO: let training/eval loop handle logic around limit_*_batches and val_check_batch
-        is_val_check_batch = False
-        if isinstance(self.trainer.limit_train_batches, int) and self.trainer.val_check_batch == float('inf'):
+        is_val_check_batch = is_last_batch
+        if isinstance(self.trainer.limit_train_batches, int) and is_infinite_dataset:
             is_val_check_batch = (batch_idx + 1) % self.trainer.limit_train_batches == 0
         elif self.trainer.val_check_batch != float('inf'):
             is_val_check_batch = (batch_idx + 1) % self.trainer.val_check_batch == 0
-
-        is_last_batch_for_infinite_dataset = is_last_batch and self.trainer.val_check_batch == float("inf")
-
-        if on_epoch:
-            return is_val_check_batch or self.trainer.should_stop or is_last_batch_for_infinite_dataset
-        else:
-            return is_val_check_batch
+        return is_val_check_batch
 
     def _build_kwargs(self, batch, batch_idx, opt_idx, hiddens):
         # enable not needing to add opt_idx to training_step
diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index f549b5d2d8ecc..18e086f44e329 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -229,7 +229,7 @@ def train_dataloader(self):
     trainer.fit(model)
 
 
-@pytest.mark.parametrize('max_epochs,batch_idx_', [(2, 5), (3, 8), (4, 12)])
+@pytest.mark.parametrize('max_epochs,batch_idx_', [(2, 5), (3, 8), (4, 70)])
 def test_on_train_batch_start_hook(max_epochs, batch_idx_):
 
     class CurrentModel(BoringModel):
@@ -241,12 +241,13 @@ def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
     model = CurrentModel()
     trainer = Trainer(max_epochs=max_epochs)
     trainer.fit(model)
+    assert len(model.val_dataloader()) < 70
     if batch_idx_ > len(model.val_dataloader()) - 1:
         assert trainer.train_loop.batch_idx == len(model.val_dataloader()) - 1
         assert trainer.global_step == len(model.val_dataloader()) * max_epochs
     else:
         assert trainer.train_loop.batch_idx == batch_idx_
-        assert trainer.global_step == (batch_idx_ + 1) * max_epochs
+        assert trainer.global_step == batch_idx_ * max_epochs
 
 
 class HookedModel(BoringModel):

From 0c2305dc9d032cb357d553b017ebf75beec03182 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Mon, 24 May 2021 17:37:07 +0200
Subject: [PATCH 08/32] Minor change

---
 pytorch_lightning/trainer/training_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index d9148be8f551e..383ed3102d462 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -878,7 +878,7 @@ def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool) -> bool:
         if not self.trainer.enable_validation:
             return False
 
-        is_val_check_epoch = (self.trainer.current_epoch + 1) % self.trainer.check_val_every_n_epoch != 1
+        is_val_check_epoch = (self.trainer.current_epoch + 1) % self.trainer.check_val_every_n_epoch == 0
         if not is_val_check_epoch:
             return False
 

From f02a866dcc125afe438b928dde6c97b4ea78b256 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 01:40:53 +0200
Subject: [PATCH 09/32] Fix test

---
 pytorch_lightning/trainer/training_loop.py | 6 ------
 tests/callbacks/test_early_stopping.py     | 4 ++--
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 383ed3102d462..ac29088f838c2 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -539,9 +539,6 @@ def run_training_epoch(self):
             # dataloader/iterator did not produce a batch
             return
 
-        # TODO
-        # self.global_step -= 1
-
         # handle epoch_output on epoch end
         self.on_train_epoch_end(epoch_output)
 
@@ -556,9 +553,6 @@ def run_training_epoch(self):
             # if validation has run, this should have been called already
             self.check_checkpoint_callback(True)
 
-        # TODO
-        # self.global_step += 1
-
     def on_train_epoch_end(self, epoch_output: List[List[List[Result]]]) -> None:
         # inform logger the batch loop has finished
         self.trainer.logger_connector.on_train_epoch_end()
diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index 8f89bedeb4f38..7d303e6ed00d6 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -157,7 +157,7 @@ def test_early_stopping_patience_train(
     """Test to ensure that early stopping is not triggered before patience is exhausted."""
 
     class ModelOverrideTrainReturn(BoringModel):
-        train_return_values = torch.Tensor(loss_values)
+        train_return_values = torch.tensor(loss_values)
 
         def training_epoch_end(self, outputs):
             loss = self.train_return_values[self.current_epoch]
@@ -169,7 +169,7 @@ def training_epoch_end(self, outputs):
         model.validation_step = None
 
     early_stop_callback = EarlyStopping(
-        monitor="train_loss", patience=patience, verbose=True, check_on_train_epoch_end=validation_step_none
+        monitor="train_loss", patience=patience, verbose=True, check_on_train_epoch_end=True
     )
     trainer = Trainer(
         default_root_dir=tmpdir,

From fa31597073a56e6d86cd928cb19dd4d2d54566f2 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 01:47:00 +0200
Subject: [PATCH 10/32] Increment the total batch idx before the accumulation
 early exit

---
 pytorch_lightning/trainer/training_loop.py |  4 ++--
 tests/tuner/test_lr_finder.py              | 27 ++++++++--------------
 2 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index ca50b088c665b..10b727edbc93a 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -529,6 +529,8 @@ def run_training_epoch(self):
             self.update_train_loop_lr_schedulers(monitor_metrics=monitor_metrics)
             self.trainer.checkpoint_connector.has_trained = True
 
+            self.total_batch_idx += 1
+
             # max steps reached, end training
             if (
                 self.max_steps is not None and self.max_steps <= self.global_step + 1
@@ -542,8 +544,6 @@ def run_training_epoch(self):
             if self.trainer.should_stop:
                 break
 
-            self.total_batch_idx += 1
-
             # stop epoch if we limited the number of training batches
             if self._num_training_batches_reached(is_last_batch):
                 break
diff --git a/tests/tuner/test_lr_finder.py b/tests/tuner/test_lr_finder.py
index 608cb8c6778bf..d4e5ef1862020 100644
--- a/tests/tuner/test_lr_finder.py
+++ b/tests/tuner/test_lr_finder.py
@@ -197,31 +197,24 @@ def test_datamodule_parameter(tmpdir):
 
 
 def test_accumulation_and_early_stopping(tmpdir):
-    """ Test that early stopping of learning rate finder works, and that
-        accumulation also works for this feature """
+    """ Test that early stopping of learning rate finder works, and that accumulation also works for this feature """
 
-    hparams = EvalModelTemplate.get_default_hparams()
-    model = EvalModelTemplate(**hparams)
+    class TestModel(BoringModel):
 
-    before_lr = hparams.get('learning_rate')
-    # logger file to get meta
+        def __init__(self):
+            super().__init__()
+            self.lr = 1e-3
+
+    model = TestModel()
     trainer = Trainer(
         default_root_dir=tmpdir,
         accumulate_grad_batches=2,
     )
-
     lrfinder = trainer.tuner.lr_find(model, early_stop_threshold=None)
-    after_lr = lrfinder.suggestion()
 
-    expected_num_lrs = 100
-    expected_batch_idx = 200 - 1
-
-    assert before_lr != after_lr, \
-        'Learning rate was not altered after running learning rate finder'
-    assert len(lrfinder.results['lr']) == expected_num_lrs, \
-        'Early stopping for learning rate finder did not work'
-    assert lrfinder._total_batch_idx == expected_batch_idx, \
-        'Accumulation parameter did not work'
+    assert 1e-3 != lrfinder.suggestion()
+    assert len(lrfinder.results['lr']) == 100
+    assert lrfinder._total_batch_idx == 200
 
 
 def test_suggestion_parameters_work(tmpdir):

From 64c49c11346cc383af36c70eca8ba9424e45e663 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 01:52:12 +0200
Subject: [PATCH 11/32] Update CHANGELOG

---
 CHANGELOG.md                  | 3 +++
 tests/tuner/test_lr_finder.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6ccb83f41ca84..14dbee08920be 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -127,6 +127,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed global step update when the epoch is skipped ([#7677](https://github.com/PyTorchLightning/pytorch-lightning/pull/7677))
 
 
+- Fixed training loop total batch counter when accumulate grad batches was enabled ([#7692](https://github.com/PyTorchLightning/pytorch-lightning/pull/7692))
+
+
 - Fixed broadcasting in multi-node, multi-gpu DDP using torch 1.7 ([#7592](https://github.com/PyTorchLightning/pytorch-lightning/pull/7592))
 
 
diff --git a/tests/tuner/test_lr_finder.py b/tests/tuner/test_lr_finder.py
index d4e5ef1862020..a74af3862c473 100644
--- a/tests/tuner/test_lr_finder.py
+++ b/tests/tuner/test_lr_finder.py
@@ -212,7 +212,7 @@ def __init__(self):
     )
     lrfinder = trainer.tuner.lr_find(model, early_stop_threshold=None)
 
-    assert 1e-3 != lrfinder.suggestion()
+    assert lrfinder.suggestion() != 1e-3
     assert len(lrfinder.results['lr']) == 100
     assert lrfinder._total_batch_idx == 200
 

From a3d328f8fc61ac576c9a202a0d42e2c5434a7cf7 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 02:27:03 +0200
Subject: [PATCH 12/32] Fix test

---
 pytorch_lightning/trainer/training_loop.py |  6 +++
 tests/loggers/test_tensorboard.py          | 49 ++++++----------------
 2 files changed, 18 insertions(+), 37 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index ac29088f838c2..dce38ef3f82c6 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -542,8 +542,14 @@ def run_training_epoch(self):
         # handle epoch_output on epoch end
         self.on_train_epoch_end(epoch_output)
 
+        # the global step is manually decreased here due to backwards compatibility with existing loggers
+        # as they expect that the same step is used for the following epoch_end calls even when the batch loop has
+        # finished. this means the attribute does not exactly track the number of optimizer steps applied.
+        # TODO(@carmocca): deprecate and rename so users don't get confused
+        self.global_step -= 1
         # log epoch metrics
         self.trainer.logger_connector.log_train_epoch_end_metrics(epoch_output)
+        self.global_step += 1
 
         self.update_lr_schedulers('epoch')
 
diff --git a/tests/loggers/test_tensorboard.py b/tests/loggers/test_tensorboard.py
index f22cdcfe2bba4..ed16bfba8c5ba 100644
--- a/tests/loggers/test_tensorboard.py
+++ b/tests/loggers/test_tensorboard.py
@@ -264,67 +264,42 @@ def test_tensorboard_log_graph_warning_no_example_input_array(tmpdir):
 
 
 @mock.patch('pytorch_lightning.loggers.TensorBoardLogger.log_metrics')
-@pytest.mark.parametrize('expected', [
-    ([5, 11, 17]),
-])
-def test_tensorboard_with_accummulated_gradients(mock_log_metrics, expected, tmpdir):
-    """
-    Tests to ensure that tensorboard log properly when accumulated_gradients > 1
-    """
+def test_tensorboard_with_a3ccummulated_gradients(mock_log_metrics, tmpdir):
+    """Tests to ensure that tensorboard log properly when accumulated_gradients > 1"""
 
     class TestModel(BoringModel):
 
         def __init__(self):
             super().__init__()
-            self._count = 0
-            self._indexes = []
-
-        def training_step(self, batch, batch_idx):
-            output = self.layer(batch)
-            loss = self.loss(batch, output)
-            self.log('count', self._count, on_step=True, on_epoch=True)
-            self.log('loss', loss, on_step=True, on_epoch=True)
+            self.indexes = []
 
+        def training_step(self, *args):
+            self.log('foo', 1, on_step=True, on_epoch=True)
             if not self.trainer.train_loop.should_accumulate():
                 if self.trainer.logger_connector.should_update_logs:
-                    self._indexes.append(self.trainer.global_step)
-
-            return loss
-
-        def validation_step(self, batch, batch_idx):
-            output = self.layer(batch)
-            loss = self.loss(batch, output)
-            self.log('val_loss', loss, on_step=True, on_epoch=True)
-            return loss
-
-        def configure_optimizers(self):
-            optimizer = torch.optim.SGD(self.layer.parameters(), lr=.001)
-            lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
-            return [optimizer], [lr_scheduler]
+                    self.indexes.append(self.trainer.global_step)
+            return super().training_step(*args)
 
     model = TestModel()
     model.training_epoch_end = None
-    model.validation_epoch_end = None
-
     logger_0 = TensorBoardLogger(tmpdir, default_hp_metric=False)
-
     trainer = Trainer(
         default_root_dir=tmpdir,
         limit_train_batches=12,
         limit_val_batches=0,
         max_epochs=3,
-        gpus=0,
         accumulate_grad_batches=2,
         logger=[logger_0],
         log_every_n_steps=3,
     )
     trainer.fit(model)
 
-    mock_count_epochs = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_epoch" in m[2]["metrics"]]
-    assert mock_count_epochs == expected
+    calls = [m[2] for m in mock_log_metrics.mock_calls]
+    count_epochs = [c["step"] for c in calls if "foo_epoch" in c["metrics"]]
+    assert count_epochs == [5, 11, 17]
 
-    mock_count_steps = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_step" in m[2]["metrics"]]
-    assert model._indexes == mock_count_steps
+    count_steps = [c["step"] for c in calls if "foo_step" in c["metrics"]]
+    assert count_steps == model.indexes
 
 
 @mock.patch('pytorch_lightning.loggers.tensorboard.SummaryWriter')

From 5a00cec4755e9b8a5eb6f7d42c92168bf1f5870b Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 02:30:13 +0200
Subject: [PATCH 13/32] Comment

---
 pytorch_lightning/trainer/training_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index dce38ef3f82c6..12be2ce067524 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -543,7 +543,7 @@ def run_training_epoch(self):
         self.on_train_epoch_end(epoch_output)
 
         # the global step is manually decreased here due to backwards compatibility with existing loggers
-        # as they expect that the same step is used for the following epoch_end calls even when the batch loop has
+        # as they expect that the same step is used when logging epoch end metrics even when the batch loop has
         # finished. this means the attribute does not exactly track the number of optimizer steps applied.
         # TODO(@carmocca): deprecate and rename so users don't get confused
         self.global_step -= 1

From ca7804fb9c3cd9952509277153efabf93a6054ce Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 03:39:13 +0200
Subject: [PATCH 14/32] Fix test

---
 tests/checkpointing/test_model_checkpoint.py | 45 +++++++++-----------
 1 file changed, 21 insertions(+), 24 deletions(-)

diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py
index 3d7a35917e095..0bdba7ce0c3da 100644
--- a/tests/checkpointing/test_model_checkpoint.py
+++ b/tests/checkpointing/test_model_checkpoint.py
@@ -180,23 +180,21 @@ def test_model_checkpoint_score_and_ckpt_val_check_interval(
     max_epochs = 3
     limit_train_batches = 12
     limit_val_batches = 7
-    lr = 1e-1
+    lr, gamma = 1e-1, 2
     monitor = 'val_log'
-    per_epoch_steps = int(limit_train_batches * val_check_interval)
-    per_epoch_call_count = limit_train_batches // per_epoch_steps
-    left_over_steps = limit_train_batches % per_epoch_steps
+    per_epoch_val_checks = int(limit_train_batches * val_check_interval)
+    per_val_check_steps, leftover_steps = divmod(limit_train_batches, per_epoch_val_checks)
 
     class CustomBoringModel(BoringModel):
 
         def __init__(self):
             super().__init__()
-            self.val_logs = torch.randn(per_epoch_call_count * max_epochs, limit_val_batches)
+            self.val_logs = torch.randn(per_val_check_steps * max_epochs, limit_val_batches)
             self.val_loop_count = 0
 
         def validation_step(self, batch, batch_idx):
             log_value = self.val_logs[self.val_loop_count, batch_idx]
             self.log('val_log', log_value)
-            self.log('epoch', self.current_epoch, on_epoch=True)
             return super().validation_step(batch, batch_idx)
 
         def validation_epoch_end(self, outputs):
@@ -213,7 +211,7 @@ def configure_optimizers(self):
                     'strict': True,
                 }
             else:
-                lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1)
+                lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)
 
             return [optimizer], [lr_scheduler]
 
@@ -241,26 +239,27 @@ def configure_optimizers(self):
 
     # on_train_end ckpt callback is called which creates an additional ckpt in case no ckpt is created at the
     # end of epoch, thus if val_check_interval doesn't align with the training steps we create an additional ckpt
-    additional_ckpt, additional_ckpt_path = 0, None
+    additional_ckpt, additional_ckpt_path = False, None
     if not epoch_aligned:
         additional_ckpt_path = [f for f in ckpt_files if 'v1' in f.stem][0]
-        additional_ckpt = 1
+        additional_ckpt = True
 
-    additional_ckpt = 1 if not epoch_aligned else 0
-    assert len(ckpt_files) == len(scores) + additional_ckpt == per_epoch_call_count * max_epochs + additional_ckpt
+    assert len(ckpt_files) == len(scores) + additional_ckpt == per_val_check_steps * max_epochs + additional_ckpt
     assert len(lr_scheduler_debug) == max_epochs
 
-    def _make_assertions(epoch, ix, add=''):
-        global_ix = ix + per_epoch_call_count * epoch
+    def _make_assertions(epoch, ix, version=''):
+        global_ix = ix + per_val_check_steps * epoch
+        duplicated = bool(version)
+
         score = scores[global_ix]
         expected_score = getattr(model, f'{monitor}s')[global_ix].mean().item()
-        expected_filename = f'{monitor}={score:.4f}-epoch={epoch}{add}.ckpt'
+        expected_filename = f'{monitor}={score:.4f}-epoch={epoch}{version}.ckpt'
         assert math.isclose(score, expected_score, rel_tol=1e-4)
 
         chk = pl_load(os.path.join(checkpoint.dirpath, expected_filename))
         assert chk['epoch'] == epoch + 1
-        epoch_num = epoch + (1 if add else 0)
-        expected_global_step = per_epoch_steps * (global_ix + 1) + (left_over_steps * epoch_num)
+        epoch_num = epoch + duplicated
+        expected_global_step = per_epoch_val_checks * (global_ix + 1) + (leftover_steps * epoch_num)
         assert chk['global_step'] == expected_global_step
 
         mc_specific_data = chk['callbacks'][type(checkpoint)]
@@ -269,15 +268,15 @@ def _make_assertions(epoch, ix, add=''):
         assert mc_specific_data['current_score'] == score
 
         if not reduce_lr_on_plateau:
-            lr_scheduler_specific_data = chk['lr_schedulers'][0]
-            did_update = 1 if (ix + 1 == per_epoch_call_count) and (epoch_aligned or add) else 0
-            assert lr_scheduler_specific_data['_step_count'] == epoch + 1 + did_update
-            assert lr_scheduler_specific_data['_last_lr'][0] == lr * (lr**(epoch + did_update))
+            actual_step_count = chk['lr_schedulers'][0]['_step_count']
+            actual_lr = chk['lr_schedulers'][0]['_last_lr'][0]
+            assert actual_step_count == epoch + 1 + duplicated
+            assert actual_lr == lr * gamma**(epoch + duplicated)
 
         return score
 
     for epoch in range(max_epochs):
-        for i in range(per_epoch_call_count):
+        for i in range(per_val_check_steps):
             score = _make_assertions(epoch, i)
 
         assert lr_scheduler_debug[epoch]['monitor_val'] == (score if reduce_lr_on_plateau else None)
@@ -285,9 +284,7 @@ def _make_assertions(epoch, ix, add=''):
 
     # check the ckpt file saved on_train_end
     if additional_ckpt_path:
-        epoch = max_epochs - 1
-        i = per_epoch_call_count - 1
-        _make_assertions(epoch, i, add='-v1')
+        _make_assertions(max_epochs - 1, per_val_check_steps - 1, version='-v1')
 
 
 @pytest.mark.parametrize("save_top_k", [-1, 0, 1, 2])

From d91eaf4b55cf6f2ba7199453bdb9a213ca7f613c Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 04:40:55 +0200
Subject: [PATCH 15/32] Fix ModelCheckpoint tests

---
 pytorch_lightning/trainer/training_loop.py   | 10 ++++++----
 tests/checkpointing/test_model_checkpoint.py | 16 +++++++++-------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 12be2ce067524..08bbfb709d5b6 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -553,11 +553,13 @@ def run_training_epoch(self):
 
         self.update_lr_schedulers('epoch')
 
-        should_skip_eval = self.trainer.evaluation_loop.should_skip_evaluation(self.trainer.num_val_batches)
-        should_train_only = self.trainer.disable_validation or should_skip_eval
-        if should_train_only:
-            # if validation has run, this should have been called already
+        did_train_only = self.trainer.disable_validation or self.trainer.evaluation_loop.should_skip_evaluation(
+            self.trainer.num_val_batches
+        )
+        if did_train_only:
+            self.global_step -= 1
             self.check_checkpoint_callback(True)
+            self.global_step += 1
 
     def on_train_epoch_end(self, epoch_output: List[List[List[Result]]]) -> None:
         # inform logger the batch loop has finished
diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py
index 0bdba7ce0c3da..f26c578151b80 100644
--- a/tests/checkpointing/test_model_checkpoint.py
+++ b/tests/checkpointing/test_model_checkpoint.py
@@ -60,9 +60,8 @@ def validation_epoch_end(self, outputs):
     "validation_step_none,val_dataloaders_none,monitor",
     [
         (False, False, 'val_log'),
-        (False, False, 'train_log_epoch'),
         (True, False, 'train_log_epoch'),
-        (False, True, 'train_log_epoch'),
+        (False, True, 'val_log'),
     ],
 )
 @pytest.mark.parametrize('reduce_lr_on_plateau', [False, True])
@@ -76,7 +75,7 @@ def test_model_checkpoint_score_and_ckpt(
     max_epochs = 3
     limit_train_batches = 5
     limit_val_batches = 7
-    lr = 1e-1
+    lr, gamma = 1e-1, 2
 
     class CustomBoringModel(BoringModel):
 
@@ -106,7 +105,7 @@ def configure_optimizers(self):
                     'strict': True,
                 }
             else:
-                lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1)
+                lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)
 
             return [optimizer], [lr_scheduler]
 
@@ -153,9 +152,12 @@ def configure_optimizers(self):
         assert mc_specific_data['current_score'] == score
 
         if not reduce_lr_on_plateau:
-            lr_scheduler_specific_data = chk['lr_schedulers'][0]
-            assert lr_scheduler_specific_data['_step_count'] == epoch + 2
-            assert lr_scheduler_specific_data['_last_lr'][0] == lr * (lr**(epoch + 1))
+            actual_step_count = chk['lr_schedulers'][0]['_step_count']
+            actual_lr = chk['lr_schedulers'][0]['_last_lr'][0]
+            # if validation_step_none, the checkpoint gets saved after the learning rate update
+            # so we need to increase the count by one
+            assert actual_step_count == epoch + 1 + validation_step_none
+            assert actual_lr == lr * gamma**(epoch + validation_step_none)
 
         assert lr_scheduler_debug[epoch]['monitor_val'] == (score if reduce_lr_on_plateau else None)
         assert lr_scheduler_debug[epoch]['monitor_key'] == (monitor if reduce_lr_on_plateau else None)

From a66b96fb5a3082ae1df73bef8279bb994ea0ac4c Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 11:59:27 +0200
Subject: [PATCH 16/32] Update test

---
 tests/models/test_hooks.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 7816bc74829ea..1298248510673 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -517,7 +517,6 @@ def test_trainer_model_hook_system_fit_no_val(tmpdir):
         'on_train_epoch_end',
         'on_epoch_end',
         'on_save_checkpoint',  # from train epoch end
-        'on_save_checkpoint',  # from on train end
         'on_train_end',
         'on_fit_end',
         'teardown_fit',

From 2eb31ed3f432021379f7e158bab3177f7af80310 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 13:37:47 +0200
Subject: [PATCH 17/32] Conflicts

---
 pl_examples/bug_report_model.py              |  1 +
 pytorch_lightning/trainer/training_loop.py   | 10 ++--------
 tests/checkpointing/test_model_checkpoint.py | 16 ++++++++--------
 3 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py
index abb65ba86fd93..70591f3649fd9 100644
--- a/pl_examples/bug_report_model.py
+++ b/pl_examples/bug_report_model.py
@@ -1,4 +1,5 @@
 import os
+from unittest.mock import MagicMock
 
 import torch
 from torch.utils.data import DataLoader, Dataset
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 90cbb0abfe23c..09a32c3c96aad 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -884,6 +884,7 @@ def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool) -> bool:
         if not is_val_check_epoch:
             return False
 
+        # val_check_batch is inf for iterable datasets with no length defined
         is_infinite_dataset = self.trainer.val_check_batch == float('inf')
         if is_last_batch and is_infinite_dataset:
             return True
@@ -891,19 +892,12 @@ def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool) -> bool:
         if self.trainer.should_stop:
             return True
 
-        # val_check_batch is inf for iterable datasets with no length defined
-        is_infinite_dataset = self.trainer.val_check_batch == float('inf')
-        if on_epoch and is_last_batch and is_infinite_dataset:
-            return True
-
-        if on_epoch and self.trainer.should_stop:
-            return True
-
         # TODO: let training/eval loop handle logic around limit_*_batches and val_check_batch
         is_val_check_batch = is_last_batch
         if isinstance(self.trainer.limit_train_batches, int) and is_infinite_dataset:
             is_val_check_batch = (batch_idx + 1) % self.trainer.limit_train_batches == 0
         elif self.trainer.val_check_batch != float('inf'):
+            is_val_check_batch = (batch_idx + 1) % self.trainer.val_check_batch == 0
         return is_val_check_batch
 
     def _build_kwargs(self, batch, batch_idx, opt_idx, hiddens):
diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py
index f26c578151b80..2f867d4e998b4 100644
--- a/tests/checkpointing/test_model_checkpoint.py
+++ b/tests/checkpointing/test_model_checkpoint.py
@@ -184,14 +184,14 @@ def test_model_checkpoint_score_and_ckpt_val_check_interval(
     limit_val_batches = 7
     lr, gamma = 1e-1, 2
     monitor = 'val_log'
-    per_epoch_val_checks = int(limit_train_batches * val_check_interval)
-    per_val_check_steps, leftover_steps = divmod(limit_train_batches, per_epoch_val_checks)
+    per_val_train_batches = int(limit_train_batches * val_check_interval)
+    per_epoch_val_checks, leftover_train_batches = divmod(limit_train_batches, per_val_train_batches)
 
     class CustomBoringModel(BoringModel):
 
         def __init__(self):
             super().__init__()
-            self.val_logs = torch.randn(per_val_check_steps * max_epochs, limit_val_batches)
+            self.val_logs = torch.randn(per_epoch_val_checks * max_epochs, limit_val_batches)
             self.val_loop_count = 0
 
         def validation_step(self, batch, batch_idx):
@@ -246,11 +246,11 @@ def configure_optimizers(self):
         additional_ckpt_path = [f for f in ckpt_files if 'v1' in f.stem][0]
         additional_ckpt = True
 
-    assert len(ckpt_files) == len(scores) + additional_ckpt == per_val_check_steps * max_epochs + additional_ckpt
+    assert len(ckpt_files) == len(scores) + additional_ckpt == per_epoch_val_checks * max_epochs + additional_ckpt
     assert len(lr_scheduler_debug) == max_epochs
 
     def _make_assertions(epoch, ix, version=''):
-        global_ix = ix + per_val_check_steps * epoch
+        global_ix = ix + per_epoch_val_checks * epoch
         duplicated = bool(version)
 
         score = scores[global_ix]
@@ -261,7 +261,7 @@ def _make_assertions(epoch, ix, version=''):
         chk = pl_load(os.path.join(checkpoint.dirpath, expected_filename))
         assert chk['epoch'] == epoch + 1
         epoch_num = epoch + duplicated
-        expected_global_step = per_epoch_val_checks * (global_ix + 1) + (leftover_steps * epoch_num)
+        expected_global_step = per_val_train_batches * (global_ix + 1) + (leftover_train_batches * epoch_num)
         assert chk['global_step'] == expected_global_step
 
         mc_specific_data = chk['callbacks'][type(checkpoint)]
@@ -278,7 +278,7 @@ def _make_assertions(epoch, ix, version=''):
         return score
 
     for epoch in range(max_epochs):
-        for i in range(per_val_check_steps):
+        for i in range(per_epoch_val_checks):
             score = _make_assertions(epoch, i)
 
         assert lr_scheduler_debug[epoch]['monitor_val'] == (score if reduce_lr_on_plateau else None)
@@ -286,7 +286,7 @@ def _make_assertions(epoch, ix, version=''):
 
     # check the ckpt file saved on_train_end
     if additional_ckpt_path:
-        _make_assertions(max_epochs - 1, per_val_check_steps - 1, version='-v1')
+        _make_assertions(max_epochs - 1, per_epoch_val_checks - 1, version='-v1')
 
 
 @pytest.mark.parametrize("save_top_k", [-1, 0, 1, 2])

From efa15296bde06ba4e8b8e34f835804a42ea97006 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 13:39:37 +0200
Subject: [PATCH 18/32] Bad merge

---
 tests/models/test_hooks.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 4ace7b89e4251..7608d890236be 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -283,6 +283,10 @@ def validation_epoch_end(self, *args, **kwargs):
         self.called.append("validation_epoch_end")
         super().validation_epoch_end(*args, **kwargs)
 
+    def on_before_zero_grad(self, *args, **kwargs):
+        self.called.append("on_before_zero_grad")
+        super().on_before_zero_grad(*args, **kwargs)
+
     def on_epoch_start(self):
         self.called.append("on_epoch_start")
         super().on_epoch_start()

From 00ccee0349ee68aea70761bf56fa1759e6c015da Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 13:40:43 +0200
Subject: [PATCH 19/32] Bad merge

---
 tests/models/test_hooks.py | 33 ---------------------------------
 1 file changed, 33 deletions(-)

diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 7608d890236be..dca56a39cd274 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -524,39 +524,6 @@ def test_trainer_model_hook_system_fit_no_val(tmpdir):
     assert model.called == expected
 
 
-def test_trainer_model_hook_system_fit_no_val(tmpdir):
-    model = HookedModel()
-    train_batches = 2
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=1,
-        limit_val_batches=0,
-        limit_train_batches=train_batches,
-        progress_bar_refresh_rate=0,
-        weights_summary=None,
-    )
-    assert model.called == []
-    trainer.fit(model)
-    expected = [
-        'setup_fit',
-        'on_fit_start',
-        'on_pretrain_routine_start',
-        'on_pretrain_routine_end',
-        'on_train_start',
-        'on_epoch_start',
-        'on_train_epoch_start',
-        *(model.train_batch * train_batches),
-        'training_epoch_end',
-        'on_train_epoch_end',
-        'on_epoch_end',
-        'on_save_checkpoint',  # from train epoch end
-        'on_train_end',
-        'on_fit_end',
-        'teardown_fit',
-    ]
-    assert model.called == expected
-
-
 def test_trainer_model_hook_system_validate(tmpdir):
     model = HookedModel()
     trainer = Trainer(

From deaa887f776d2af2bd2fe939e64eaeb9beed753a Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 13:41:03 +0200
Subject: [PATCH 20/32] Unrelated change

---
 pl_examples/bug_report_model.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py
index 70591f3649fd9..abb65ba86fd93 100644
--- a/pl_examples/bug_report_model.py
+++ b/pl_examples/bug_report_model.py
@@ -1,5 +1,4 @@
 import os
-from unittest.mock import MagicMock
 
 import torch
 from torch.utils.data import DataLoader, Dataset

From fccd7c4727bac86d3abdd60b5313b169d4deb7c8 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 13:41:32 +0200
Subject: [PATCH 21/32] Unrelated change

---
 tests/models/test_hooks.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index dca56a39cd274..913f403a14dd3 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -502,7 +502,6 @@ def test_trainer_model_hook_system_fit_no_val(tmpdir):
         weights_summary=None,
     )
     assert model.called == []
-
     trainer.fit(model)
     expected = [
         'setup_fit',

From 747a39912203c6dd0865e0152e7c43acb5b25f67 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 17:20:32 +0200
Subject: [PATCH 22/32] Remove comment

---
 tests/trainer/loops/test_training_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/loops/test_training_loop.py b/tests/trainer/loops/test_training_loop.py
index 2e17f57ec9479..da4ecbe5a9f05 100644
--- a/tests/trainer/loops/test_training_loop.py
+++ b/tests/trainer/loops/test_training_loop.py
@@ -141,4 +141,4 @@ def validation_step(self, *args):
 
     assert trainer.current_epoch == 0
     assert trainer.global_step == 5
-    assert model.validation_called_at == (0, 4)  # TODO(@carmocca): should be 5 - will be fixed in next PR
+    assert model.validation_called_at == (0, 4)

From a2943e5f60b89bd4ac9ce6f362b93219ca2fe09f Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 17:45:29 +0200
Subject: [PATCH 23/32] Add `ModelPruning(prune_on_train_epoch_end=bool)` to
 choose when to apply pruning

---
 pytorch_lightning/callbacks/pruning.py | 22 ++++++++++++++++++----
 tests/callbacks/test_pruning.py        | 13 +++++++++----
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/pytorch_lightning/callbacks/pruning.py b/pytorch_lightning/callbacks/pruning.py
index b044fc3e6e38a..e7da752d1c844 100644
--- a/pytorch_lightning/callbacks/pruning.py
+++ b/pytorch_lightning/callbacks/pruning.py
@@ -71,6 +71,7 @@ def __init__(
         pruning_dim: Optional[int] = None,
         pruning_norm: Optional[int] = None,
         verbose: int = 0,
+        prune_on_train_epoch_end: bool = True,
     ) -> None:
         """
         Model pruning Callback, using PyTorch's prune utilities.
@@ -141,6 +142,9 @@ def __init__(
 
             verbose: Verbosity level. 0 to disable, 1 to log overall sparsity, 2 to log per-layer sparsity
 
+            prune_on_train_epoch_end: whether to apply pruning at the end of the training epoch.
+                If this is ``False``, then the check runs at the end of the validation epoch.
+
         Raises:
             MisconfigurationException:
                 If ``parameter_names`` is neither ``"weight"`` nor ``"bias"``,
@@ -155,6 +159,7 @@ def __init__(
         self._parameters_to_prune = parameters_to_prune
         self._use_lottery_ticket_hypothesis = use_lottery_ticket_hypothesis
         self._resample_parameters = resample_parameters
+        self._prune_on_train_epoch_end = prune_on_train_epoch_end
         self._parameter_names = parameter_names or self.PARAMETER_NAMES
         self._global_kwargs: Dict[str, Any] = {}
         self._original_layers: Optional[Dict[int, _LayerRef]] = None
@@ -381,8 +386,7 @@ def on_before_accelerator_backend_setup(self, trainer: 'pl.Trainer', pl_module:
                 self._original_layers.setdefault(id_, _LayerRef(data=deepcopy(module), names=[]))
                 self._original_layers[id_]["names"].append((i, name))
 
-    def on_train_epoch_end(self, trainer: 'pl.Trainer', pl_module: LightningModule) -> None:  # type: ignore
-        current_epoch = pl_module.current_epoch
+    def _run_pruning(self, current_epoch: int) -> None:
         prune = self._apply_pruning(current_epoch) if callable(self._apply_pruning) else self._apply_pruning
         amount = self.amount(current_epoch) if callable(self.amount) else self.amount
         if not prune or not amount:
@@ -395,9 +399,19 @@ def on_train_epoch_end(self, trainer: 'pl.Trainer', pl_module: LightningModule)
         ):
             self.apply_lottery_ticket_hypothesis()
 
+    def on_train_epoch_end(self, trainer: 'pl.Trainer', pl_module: LightningModule) -> None:  # type: ignore
+        if self._prune_on_train_epoch_end:
+            rank_zero_debug("`ModelPruning.on_train_epoch_end`. Applying pruning")
+            self._run_pruning(pl_module.current_epoch)
+
+    def on_validation_epoch_end(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule') -> None:
+        if not trainer.sanity_checking and not self._prune_on_train_epoch_end:
+            rank_zero_debug("`ModelPruning.on_validation_epoch_end`. Applying pruning")
+            self._run_pruning(pl_module.current_epoch)
+
     def on_train_end(self, trainer: 'pl.Trainer', pl_module: LightningModule) -> None:
         if self._make_pruning_permanent:
-            rank_zero_debug("`ModelPruning.on_train_end`. Pruning is made permanent for this checkpoint.")
+            rank_zero_debug("`ModelPruning.on_train_end`. Pruning is made permanent for this checkpoint")
             self.make_pruning_permanent(pl_module)
 
     def on_save_checkpoint(
@@ -407,7 +421,7 @@ def on_save_checkpoint(
         checkpoint: Dict[str, Any],
     ) -> Dict[str, Any]:
         if self._make_pruning_permanent:
-            rank_zero_debug("`ModelPruning.on_save_checkpoint`. Pruning is made permanent for this checkpoint.")
+            rank_zero_debug("`ModelPruning.on_save_checkpoint`. Pruning is made permanent for this checkpoint")
             prev_device = pl_module.device
             # prune a copy so training can continue with the same buffers
             copy = deepcopy(pl_module.to("cpu"))
diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py
index 5a06a92a13fa8..5710e9d1166c0 100644
--- a/tests/callbacks/test_pruning.py
+++ b/tests/callbacks/test_pruning.py
@@ -270,7 +270,8 @@ def test_multiple_pruning_callbacks(tmpdir, caplog, make_pruning_permanent: bool
     assert not has_pruning if make_pruning_permanent else has_pruning
 
 
-def test_permanent_when_model_is_saved_multiple_times(tmpdir, caplog):
+@pytest.mark.parametrize("on_train_epoch_end", (False, True))
+def test_permanent_when_model_is_saved_multiple_times(tmpdir, caplog, on_train_epoch_end):
     """
     When a model is saved multiple times and make_permanent=True, we need to
     make sure a copy is pruned and not the trained model if we want to continue
@@ -282,15 +283,19 @@ class TestPruning(ModelPruning):
 
         def on_save_checkpoint(self, trainer, pl_module, checkpoint):
             super().on_save_checkpoint(trainer, pl_module, checkpoint)
-            assert "layer.mlp_3.weight_orig" not in checkpoint["state_dict"]
-            assert hasattr(pl_module.layer.mlp_3, "weight_orig")
+            if not on_train_epoch_end:
+                # these checks only work if pruning on `validation_epoch_end`
+                # because `on_save_checkpoint` is called before `on_train_epoch_end`
+                assert "layer.mlp_3.weight_orig" not in checkpoint["state_dict"]
+                assert hasattr(pl_module.layer.mlp_3, "weight_orig")
 
     model = TestModel()
     pruning_callback = TestPruning(
         "random_unstructured",
         parameters_to_prune=[(model.layer.mlp_3, "weight")],
         verbose=1,
-        make_pruning_permanent=True
+        make_pruning_permanent=True,
+        prune_on_train_epoch_end=on_train_epoch_end,
     )
     ckpt_callback = ModelCheckpoint(monitor="test", save_top_k=2, save_last=True)
     trainer = Trainer(callbacks=[pruning_callback, ckpt_callback], max_epochs=3, progress_bar_refresh_rate=0)

From c819df5b95dae242680a4d0199dfe0d5be0c3bb7 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 17:50:19 +0200
Subject: [PATCH 24/32] Update CHANGELOG

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 14dbee08920be..ab57e7bed6c0e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added LightningCLI support for config files on object stores ([#7521](https://github.com/PyTorchLightning/pytorch-lightning/pull/7521))
 
 
+- Added `ModelPruning(prune_on_train_epoch_end)` to choose when to apply pruning ([#7704](https://github.com/PyTorchLightning/pytorch-lightning/pull/7704))
+
+
 - Added support for checkpointing based on a provided time interval during training ([#7515](https://github.com/PyTorchLightning/pytorch-lightning/pull/7515))
 
 

From c0201bc74aeb65681c38b0b1b3e28cfbd14a0d1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 18:15:15 +0200
Subject: [PATCH 25/32] Update CHANGELOG.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab57e7bed6c0e..ea067090ba1b4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,7 +18,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added LightningCLI support for config files on object stores ([#7521](https://github.com/PyTorchLightning/pytorch-lightning/pull/7521))
 
 
-- Added `ModelPruning(prune_on_train_epoch_end)` to choose when to apply pruning ([#7704](https://github.com/PyTorchLightning/pytorch-lightning/pull/7704))
+- Added `ModelPruning(prune_on_train_epoch_end=True|False)` to choose when to apply pruning ([#7704](https://github.com/PyTorchLightning/pytorch-lightning/pull/7704))
 
 
 - Added support for checkpointing based on a provided time interval during training ([#7515](https://github.com/PyTorchLightning/pytorch-lightning/pull/7515))

From 927ab83542d9db9c07ef7ccc18f933b47171eeef Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 18:37:14 +0200
Subject: [PATCH 26/32] Test with regex

---
 tests/callbacks/test_pruning.py | 35 ++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py
index 5710e9d1166c0..0500fd6b4e6fc 100644
--- a/tests/callbacks/test_pruning.py
+++ b/tests/callbacks/test_pruning.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import re
 from collections import OrderedDict
 from logging import INFO
 from typing import Union
@@ -250,17 +251,20 @@ def test_multiple_pruning_callbacks(tmpdir, caplog, make_pruning_permanent: bool
 
     actual = [m.strip() for m in caplog.messages]
     actual = [m for m in actual if m.startswith("Applied")]
-    assert actual == [
-        "Applied `L1Unstructured`. Pruned: 0/1122 (0.00%) -> 544/1122 (48.48%)",
-        "Applied `L1Unstructured` to `Linear(in_features=32, out_features=32, bias=True).weight` with amount=0.5. Pruned: 0 (0.00%) -> 503 (49.12%)",  # noqa: E501
-        "Applied `L1Unstructured` to `Linear(in_features=32, out_features=2, bias=True).weight` with amount=0.5. Pruned: 0 (0.00%) -> 41 (64.06%)",  # noqa: E501
-        "Applied `RandomUnstructured`. Pruned: 544/1122 (48.48%) -> 680/1122 (60.61%)",
-        "Applied `RandomUnstructured` to `Linear(in_features=32, out_features=32, bias=True).weight` with amount=0.25. Pruned: 503 (49.12%) -> 629 (61.43%)",  # noqa: E501
-        "Applied `RandomUnstructured` to `Linear(in_features=32, out_features=2, bias=True).weight` with amount=0.25. Pruned: 41 (64.06%) -> 51 (79.69%)",  # noqa: E501
-        "Applied `L1Unstructured`. Pruned: 680/1122 (60.61%) -> 884/1122 (78.79%)",
-        "Applied `L1Unstructured` to `Linear(in_features=32, out_features=32, bias=True).weight` with amount=0.5. Pruned: 629 (61.43%) -> 827 (80.76%)",  # noqa: E501
-        "Applied `L1Unstructured` to `Linear(in_features=32, out_features=2, bias=True).weight` with amount=0.5. Pruned: 51 (79.69%) -> 57 (89.06%)",  # noqa: E501
+    percentage = r"\d+(?:\.\d+)?%"
+    expected = [
+        rf"Applied `L1Unstructured`. Pruned: \d+\/1122 \({percentage}\) -> \d+\/1122 \({percentage}\)",
+        rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.5. Pruned: 0 \(0.00%\) -> \d+ \({percentage}\)",  # noqa: E501
+        rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.5. Pruned: 0 \(0.00%\) -> \d+ \({percentage}\)",  # noqa: E501
+        rf"Applied `RandomUnstructured`. Pruned: \d+\/1122 \({percentage}\) -> \d+\/1122 \({percentage}\)",
+        rf"Applied `RandomUnstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.25. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)",  # noqa: E501
+        rf"Applied `RandomUnstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.25. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)",  # noqa: E501
+        rf"Applied `L1Unstructured`. Pruned: \d+\/1122 \({percentage}\) -> \d+\/1122 \({percentage}\)",
+        rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.5. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)",  # noqa: E501
+        rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.5. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)",  # noqa: E501
     ]
+    expected = [re.compile(s) for s in expected]
+    assert all(regex.match(s) for s, regex in zip(actual, expected))
 
     filepath = str(tmpdir / "foo.ckpt")
     trainer.save_checkpoint(filepath)
@@ -304,11 +308,14 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint):
 
     actual = [m.strip() for m in caplog.messages]
     actual = [m for m in actual if m.startswith("Applied")]
-    assert actual == [
-        "Applied `RandomUnstructured`. Pruned: 0/66 (0.00%) -> 32/66 (48.48%)",
-        "Applied `RandomUnstructured`. Pruned: 32/66 (48.48%) -> 48/66 (72.73%)",
-        "Applied `RandomUnstructured`. Pruned: 48/66 (72.73%) -> 56/66 (84.85%)",
+    percentage = r"\d+(?:\.\d+)?%"
+    expected = [
+        rf"Applied `RandomUnstructured`. Pruned: \d+\/66 \({percentage}\) -> \d+\/66 \({percentage}\)",
+        rf"Applied `RandomUnstructured`. Pruned: \d+\/66 \({percentage}\) -> \d+\/66 \({percentage}\)",
+        rf"Applied `RandomUnstructured`. Pruned: \d+\/66 \({percentage}\) -> \d+\/66 \({percentage}\)",
     ]
+    expected = [re.compile(s) for s in expected]
+    assert all(regex.match(s) for s, regex in zip(actual, expected))
 
     # removed on_train_end
     assert not hasattr(model.layer.mlp_3, "weight_orig")

From 08a5c4bfd912d83a47cc5e802ae0d91ddbd506f9 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 18:38:59 +0200
Subject: [PATCH 27/32] Minor change

---
 tests/callbacks/test_pruning.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py
index 0500fd6b4e6fc..65ec440dcee75 100644
--- a/tests/callbacks/test_pruning.py
+++ b/tests/callbacks/test_pruning.py
@@ -251,17 +251,17 @@ def test_multiple_pruning_callbacks(tmpdir, caplog, make_pruning_permanent: bool
 
     actual = [m.strip() for m in caplog.messages]
     actual = [m for m in actual if m.startswith("Applied")]
-    percentage = r"\d+(?:\.\d+)?%"
+    percentage = r"\(\d+(?:\.\d+)?%\)"
     expected = [
-        rf"Applied `L1Unstructured`. Pruned: \d+\/1122 \({percentage}\) -> \d+\/1122 \({percentage}\)",
-        rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.5. Pruned: 0 \(0.00%\) -> \d+ \({percentage}\)",  # noqa: E501
-        rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.5. Pruned: 0 \(0.00%\) -> \d+ \({percentage}\)",  # noqa: E501
-        rf"Applied `RandomUnstructured`. Pruned: \d+\/1122 \({percentage}\) -> \d+\/1122 \({percentage}\)",
-        rf"Applied `RandomUnstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.25. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)",  # noqa: E501
-        rf"Applied `RandomUnstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.25. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)",  # noqa: E501
-        rf"Applied `L1Unstructured`. Pruned: \d+\/1122 \({percentage}\) -> \d+\/1122 \({percentage}\)",
-        rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.5. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)",  # noqa: E501
-        rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.5. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)",  # noqa: E501
+        rf"Applied `L1Unstructured`. Pruned: \d+\/1122 {percentage} -> \d+\/1122 {percentage}",
+        rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.5. Pruned: 0 \(0.00%\) -> \d+ {percentage}",  # noqa: E501
+        rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.5. Pruned: 0 \(0.00%\) -> \d+ {percentage}",  # noqa: E501
+        rf"Applied `RandomUnstructured`. Pruned: \d+\/1122 {percentage} -> \d+\/1122 {percentage}",
+        rf"Applied `RandomUnstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.25. Pruned: \d+ {percentage} -> \d+ {percentage}",  # noqa: E501
+        rf"Applied `RandomUnstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.25. Pruned: \d+ {percentage} -> \d+ {percentage}",  # noqa: E501
+        rf"Applied `L1Unstructured`. Pruned: \d+\/1122 {percentage} -> \d+\/1122 {percentage}",
+        rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.5. Pruned: \d+ {percentage} -> \d+ {percentage}",  # noqa: E501
+        rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.5. Pruned: \d+ {percentage} -> \d+ {percentage}",  # noqa: E501
     ]
     expected = [re.compile(s) for s in expected]
     assert all(regex.match(s) for s, regex in zip(actual, expected))
@@ -308,11 +308,11 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint):
 
     actual = [m.strip() for m in caplog.messages]
     actual = [m for m in actual if m.startswith("Applied")]
-    percentage = r"\d+(?:\.\d+)?%"
+    percentage = r"\(\d+(?:\.\d+)?%\)"
     expected = [
-        rf"Applied `RandomUnstructured`. Pruned: \d+\/66 \({percentage}\) -> \d+\/66 \({percentage}\)",
-        rf"Applied `RandomUnstructured`. Pruned: \d+\/66 \({percentage}\) -> \d+\/66 \({percentage}\)",
-        rf"Applied `RandomUnstructured`. Pruned: \d+\/66 \({percentage}\) -> \d+\/66 \({percentage}\)",
+        rf"Applied `RandomUnstructured`. Pruned: \d+\/66 {percentage} -> \d+\/66 {percentage}",
+        rf"Applied `RandomUnstructured`. Pruned: \d+\/66 {percentage} -> \d+\/66 {percentage}",
+        rf"Applied `RandomUnstructured`. Pruned: \d+\/66 {percentage} -> \d+\/66 {percentage}",
     ]
     expected = [re.compile(s) for s in expected]
     assert all(regex.match(s) for s, regex in zip(actual, expected))

From b0c630f943466fe4b95940ed1964da5e42f015a7 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 25 May 2021 18:40:07 +0200
Subject: [PATCH 28/32] No need to seed anymore

---
 tests/callbacks/test_pruning.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py
index 65ec440dcee75..d4957905454d8 100644
--- a/tests/callbacks/test_pruning.py
+++ b/tests/callbacks/test_pruning.py
@@ -22,7 +22,7 @@
 from torch import nn
 from torch.nn import Sequential
 
-from pytorch_lightning import seed_everything, Trainer
+from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks import ModelCheckpoint, ModelPruning
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers import BoringModel
@@ -225,7 +225,6 @@ def apply_lottery_ticket_hypothesis(self):
 
 @pytest.mark.parametrize("make_pruning_permanent", (False, True))
 def test_multiple_pruning_callbacks(tmpdir, caplog, make_pruning_permanent: bool):
-    seed_everything(0)
     model = TestModel()
     pruning_kwargs = {
         'parameters_to_prune': [(model.layer.mlp_1, "weight"), (model.layer.mlp_3, "weight")],
@@ -281,7 +280,6 @@ def test_permanent_when_model_is_saved_multiple_times(tmpdir, caplog, on_train_e
     make sure a copy is pruned and not the trained model if we want to continue
     with the same pruning buffers.
     """
-    seed_everything(0)
 
     class TestPruning(ModelPruning):
 

From 4284a63ec6cc5dd11ae7659c4a117eb21639a0b3 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Wed, 26 May 2021 01:42:05 +0200
Subject: [PATCH 29/32] Update CHANGELOG

---
 CHANGELOG.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c07b13ac93612..9de22de41da69 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -67,7 +67,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
     * Refactored result handling in training loop ([#7506](https://github.com/PyTorchLightning/pytorch-lightning/pull/7506))
     * Moved attributes `hiddens` and `split_idx` to TrainLoop ([#7507](https://github.com/PyTorchLightning/pytorch-lightning/pull/7507))
     * Refactored the logic around manual and automatic optimization inside the optimizer loop ([#7526](https://github.com/PyTorchLightning/pytorch-lightning/pull/7526))
-
+    * Simplified "should run validation" logic ([#7682](https://github.com/PyTorchLightning/pytorch-lightning/pull/7682))
+    * Refactored "should run validation" logic when the trainer is signaled to stop ([#7701](https://github.com/PyTorchLightning/pytorch-lightning/pull/7701))
+    * Always run validation inside the training loop epoch ([#7357](https://github.com/PyTorchLightning/pytorch-lightning/pull/7357))
 
 - Moved `ignore_scalar_return_in_dp` warning suppression to the DataParallelPlugin class ([#7421](https://github.com/PyTorchLightning/pytorch-lightning/pull/7421/))
 

From 09f5d39df1fbef6b5a28717cc9d54f41f39924cd Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Wed, 26 May 2021 02:54:06 +0200
Subject: [PATCH 30/32] Update CHANGELOG

---
 CHANGELOG.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9de22de41da69..a065b846b4bb1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -62,6 +62,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Changed `clip_grad_norm` to use `torch.nn.utils.clip_grad_norm_` ([#7025](https://github.com/PyTorchLightning/pytorch-lightning/pull/7025))
 
+
+- Validation is now always run inside the training epoch scope ([#7357](https://github.com/PyTorchLightning/pytorch-lightning/pull/7357))
+
+
 - Refactored Loops
     * Moved attributes `global_step`, `current_epoch`, `max/min_steps`, `max/min_epochs`, `batch_idx`, and `total_batch_idx` to TrainLoop ([#7437](https://github.com/PyTorchLightning/pytorch-lightning/pull/7025))
     * Refactored result handling in training loop ([#7506](https://github.com/PyTorchLightning/pytorch-lightning/pull/7506))
@@ -69,7 +73,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
     * Refactored the logic around manual and automatic optimization inside the optimizer loop ([#7526](https://github.com/PyTorchLightning/pytorch-lightning/pull/7526))
     * Simplified "should run validation" logic ([#7682](https://github.com/PyTorchLightning/pytorch-lightning/pull/7682))
     * Refactored "should run validation" logic when the trainer is signaled to stop ([#7701](https://github.com/PyTorchLightning/pytorch-lightning/pull/7701))
-    * Always run validation inside the training loop epoch ([#7357](https://github.com/PyTorchLightning/pytorch-lightning/pull/7357))
 
 - Moved `ignore_scalar_return_in_dp` warning suppression to the DataParallelPlugin class ([#7421](https://github.com/PyTorchLightning/pytorch-lightning/pull/7421/))
 

From 95eeda127f71ba883bf764ca2c2de2c793c48961 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Wed, 26 May 2021 02:54:32 +0200
Subject: [PATCH 31/32] Update CHANGELOG

---
 CHANGELOG.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a065b846b4bb1..8dfcdbb08d346 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -71,8 +71,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
     * Refactored result handling in training loop ([#7506](https://github.com/PyTorchLightning/pytorch-lightning/pull/7506))
     * Moved attributes `hiddens` and `split_idx` to TrainLoop ([#7507](https://github.com/PyTorchLightning/pytorch-lightning/pull/7507))
     * Refactored the logic around manual and automatic optimization inside the optimizer loop ([#7526](https://github.com/PyTorchLightning/pytorch-lightning/pull/7526))
-    * Simplified "should run validation" logic ([#7682](https://github.com/PyTorchLightning/pytorch-lightning/pull/7682))
-    * Refactored "should run validation" logic when the trainer is signaled to stop ([#7701](https://github.com/PyTorchLightning/pytorch-lightning/pull/7701))
 
 - Moved `ignore_scalar_return_in_dp` warning suppression to the DataParallelPlugin class ([#7421](https://github.com/PyTorchLightning/pytorch-lightning/pull/7421/))
 

From 1b18342ba47491df84053b0e9f5334630da50869 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Wed, 26 May 2021 02:55:44 +0200
Subject: [PATCH 32/32] Update CHANGELOG

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8dfcdbb08d346..199aa70329e24 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -72,6 +72,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
     * Moved attributes `hiddens` and `split_idx` to TrainLoop ([#7507](https://github.com/PyTorchLightning/pytorch-lightning/pull/7507))
     * Refactored the logic around manual and automatic optimization inside the optimizer loop ([#7526](https://github.com/PyTorchLightning/pytorch-lightning/pull/7526))
 
+
 - Moved `ignore_scalar_return_in_dp` warning suppression to the DataParallelPlugin class ([#7421](https://github.com/PyTorchLightning/pytorch-lightning/pull/7421/))