From df4d846dc6a9759eb5a9ef616b8ce41436625f7f Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 4 May 2021 23:22:45 +0200 Subject: [PATCH 01/32] Refactor global step update --- pytorch_lightning/trainer/training_loop.py | 40 +++++----------------- 1 file changed, 8 insertions(+), 32 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 8c510f08a83fc..67bfa8a551b57 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -512,28 +512,17 @@ def run_training_epoch(self): self.update_train_loop_lr_schedulers(monitor_metrics=monitor_metrics) self.trainer.checkpoint_connector.has_trained = True - # max steps reached, end training - if ( - self.trainer.max_steps is not None and self.trainer.max_steps <= self.trainer.global_step + 1 - and self._accumulated_batches_reached() - ): - break - - # end epoch early - # stop when the flag is changed or we've gone past the amount - # requested in the batches - if self.trainer.should_stop: - break - self.trainer.total_batch_idx += 1 - # stop epoch if we limited the number of training batches - if self._num_training_batches_reached(is_last_batch): - break - # progress global step according to grads progress self.increment_accumulated_grad_global_step() + max_steps_reached = ( + self.trainer.max_steps is not None and self.trainer.max_steps <= self.trainer.global_step + ) + if max_steps_reached or self.trainer.should_stop or self._num_training_batches_reached(is_last_batch): + break + if batch_idx is None: # dataloader/iterator did not produce a batch return @@ -555,15 +544,6 @@ def run_training_epoch(self): if should_train_only: self.check_checkpoint_callback(True) - if should_check_val: - self.trainer.validating = True - self.trainer.run_evaluation(on_epoch=True) - self.trainer.training = True - - # increment the global step once - # progress global step according to grads progress - self.increment_accumulated_grad_global_step() - def on_train_epoch_end(self, epoch_output: List[List[List[Result]]]) -> None: # inform logger the batch loop has finished self.trainer.logger_connector.on_train_epoch_end() @@ -863,16 +843,12 @@ def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool, on_epoch: bo elif self.trainer.val_check_batch != float('inf'): is_val_check_batch = (batch_idx + 1) % self.trainer.val_check_batch == 0 - # Note: num_training_batches is also inf for iterable datasets with no length defined - epoch_end_val_check = (batch_idx + 1) % self.trainer.num_training_batches == 0 is_last_batch_for_infinite_dataset = is_last_batch and self.trainer.val_check_batch == float("inf") if on_epoch: - return ( - is_val_check_batch and epoch_end_val_check - ) or self.trainer.should_stop or is_last_batch_for_infinite_dataset + return is_val_check_batch or self.trainer.should_stop or is_last_batch_for_infinite_dataset else: - return is_val_check_batch and not epoch_end_val_check + return is_val_check_batch def build_train_args(self, batch, batch_idx, opt_idx, hiddens): # enable not needing to add opt_idx to training_step From 8e402a43f77b7c05cea0669c6450e236584a2020 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 6 May 2021 17:55:55 +0200 Subject: [PATCH 02/32] WIP --- pytorch_lightning/trainer/training_loop.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 67bfa8a551b57..3598e31abc3d0 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -458,10 +458,7 @@ def run_training_epoch(self): train_dataloader = self.trainer.data_connector.get_profiled_train_dataloader(train_dataloader) dataloader_idx = 0 - val_loop_called = False - batch_idx = None - is_last_batch = None for batch_idx, (batch, is_last_batch) in train_dataloader: self.trainer.batch_idx = batch_idx @@ -500,7 +497,6 @@ def run_training_epoch(self): self.trainer.validating = True self.trainer.run_evaluation() self.trainer.training = True - val_loop_called = True # ----------------------------------------- # SAVE LOGGERS (ie: Tensorboard, etc...) @@ -527,23 +523,24 @@ def run_training_epoch(self): # dataloader/iterator did not produce a batch return + #self.trainer.global_step -= 1 + # handle epoch_output on epoch end self.on_train_epoch_end(epoch_output) # log epoch metrics self.trainer.logger_connector.log_train_epoch_end_metrics(epoch_output) - should_check_val = self._should_check_val_fx(batch_idx, is_last_batch, on_epoch=True) + # TODO: make sure we don't update these again if already updated inside the batch loop + self.trainer.optimizer_connector.update_learning_rates(interval='epoch') + should_skip_eval = self.trainer.evaluation_loop.should_skip_evaluation(self.trainer.num_val_batches) should_train_only = self.trainer.disable_validation or should_skip_eval - - # update epoch level lr_schedulers if no val loop outside train loop is triggered - if (val_loop_called and not should_check_val) or should_train_only: - self.trainer.optimizer_connector.update_learning_rates(interval='epoch') - if should_train_only: self.check_checkpoint_callback(True) + #self.trainer.global_step += 1 + def on_train_epoch_end(self, epoch_output: List[List[List[Result]]]) -> None: # inform logger the batch loop has finished self.trainer.logger_connector.on_train_epoch_end() From 2404d58914e1bfada9eb8e274a9881963a5da5e5 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 24 May 2021 13:24:38 +0200 Subject: [PATCH 03/32] WIP --- pytorch_lightning/trainer/training_loop.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 429a8ea2e678c..aee116e1c9c45 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -550,12 +550,12 @@ def run_training_epoch(self): # log epoch metrics self.trainer.logger_connector.log_train_epoch_end_metrics(epoch_output) - # TODO: make sure we don't update these again if already updated inside the batch loop self.trainer.optimizer_connector.update_learning_rates(interval='epoch') should_skip_eval = self.trainer.evaluation_loop.should_skip_evaluation(self.trainer.num_val_batches) should_train_only = self.trainer.disable_validation or should_skip_eval if should_train_only: + # if validation has run, this should have been called already self.check_checkpoint_callback(True) #self.trainer.global_step += 1 @@ -843,7 +843,7 @@ def backward(self, result, optimizer, opt_idx, *args, **kwargs): # track gradients result.grad_norm_dict = self.track_and_norm_grad(optimizer=optimizer) - def update_train_loop_lr_schedulers(self, monitor_metrics=None): + def update_train_loop_lr_schedulers(self, interval: str, monitor_metrics: Dict[str, _METRIC] = None) -> None: num_accumulated_batches_reached = self._accumulated_batches_reached() num_training_batches_reached = self._num_training_batches_reached() From 3e5e0876f905d8616cd1156124a504a9f1fc9a2f Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 24 May 2021 13:47:27 +0200 Subject: [PATCH 04/32] WIP --- .../trainer/connectors/optimizer_connector.py | 21 ++++++-------- pytorch_lightning/trainer/trainer.py | 13 +-------- pytorch_lightning/trainer/training_loop.py | 29 +++++++++---------- 3 files changed, 24 insertions(+), 39 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/optimizer_connector.py b/pytorch_lightning/trainer/connectors/optimizer_connector.py index e7fbdf9b18c02..2797504288bd3 100644 --- a/pytorch_lightning/trainer/connectors/optimizer_connector.py +++ b/pytorch_lightning/trainer/connectors/optimizer_connector.py @@ -11,30 +11,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Dict, List, Optional +from typing import List, Optional +from weakref import proxy +import pytorch_lightning as pl from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException class OptimizerConnector: - def __init__(self, trainer): - self.trainer = trainer + def __init__(self, trainer: 'pl.Trainer') -> None: + self.trainer = proxy(trainer) - def on_trainer_init(self): + def on_trainer_init(self) -> None: self.trainer.lr_schedulers = [] self.trainer.optimizers = [] self.trainer.optimizer_frequencies = [] - def update_learning_rates( - self, interval: str, monitor_metrics: Optional[Dict[str, Any]] = None, opt_indices: Optional[List[int]] = None - ): + def update_learning_rates(self, interval: str, opt_indices: Optional[List[int]] = None) -> None: """Update learning rates. Args: interval: either 'epoch' or 'step'. - monitor_metrics: dict of possible values to monitor + opt_indices: indices of the optimizers to update. """ if not self.trainer.lr_schedulers or not self.trainer.lightning_module.automatic_optimization: return @@ -55,10 +55,7 @@ def update_learning_rates( monitor_key, monitor_val = None, None if lr_scheduler['reduce_on_plateau']: monitor_key = lr_scheduler['monitor'] - monitor_val = ( - monitor_metrics.get(monitor_key) if monitor_metrics is not None else - self.trainer.logger_connector.callback_metrics.get(monitor_key) - ) + monitor_val = self.trainer.logger_connector.callback_metrics.get(monitor_key) if monitor_val is None: if lr_scheduler.get('strict', True): avail_metrics = list(self.trainer.logger_connector.callback_metrics.keys()) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 6a20625978e39..db7f99add85e9 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -923,7 +923,7 @@ def _run_train(self) -> None: self.state.stage = None raise - def _run_evaluation(self, on_epoch: bool = False) -> _EVALUATE_OUTPUT: + def _run_evaluation(self) -> _EVALUATE_OUTPUT: if not (self.evaluating or self.sanity_checking): rank_zero_warn( f"`trainer._run_evaluation()` was called but the running stage is set to {self.state.stage}." @@ -1005,17 +1005,6 @@ def _run_evaluation(self, on_epoch: bool = False) -> _EVALUATE_OUTPUT: # hook self.evaluation_loop.on_evaluation_epoch_end() - # update epoch-level lr_schedulers - if on_epoch: - self.optimizer_connector.update_learning_rates( - interval='epoch', - opt_indices=[ - opt_idx for opt_idx, _ in self.train_loop.get_active_optimizers( - batch_idx=(self.train_loop.total_batch_idx - 1) - ) # Select the optimizers which were used in the last batch of the epoch - ], - ) - # log epoch metrics eval_loop_results = self.logger_connector.get_evaluate_epoch_results() diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index aee116e1c9c45..a60ab7a5ccc32 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -34,6 +34,7 @@ from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.parsing import AttributeDict from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature +from pytorch_lightning.utilities.types import _METRIC from pytorch_lightning.utilities.warnings import WarningCache @@ -523,11 +524,10 @@ def run_training_epoch(self): self.save_loggers_on_train_batch_end() # update LR schedulers - monitor_metrics = deepcopy(self.trainer.logger_connector.callback_metrics) - self.update_train_loop_lr_schedulers(monitor_metrics=monitor_metrics) + self.update_lr_schedulers('step') self.trainer.checkpoint_connector.has_trained = True - self.trainer.total_batch_idx += 1 + self.total_batch_idx += 1 # progress global step according to grads progress self.increment_accumulated_grad_global_step() @@ -550,7 +550,7 @@ def run_training_epoch(self): # log epoch metrics self.trainer.logger_connector.log_train_epoch_end_metrics(epoch_output) - self.trainer.optimizer_connector.update_learning_rates(interval='epoch') + self.update_lr_schedulers('epoch') should_skip_eval = self.trainer.evaluation_loop.should_skip_evaluation(self.trainer.num_val_batches) should_train_only = self.trainer.disable_validation or should_skip_eval @@ -843,17 +843,16 @@ def backward(self, result, optimizer, opt_idx, *args, **kwargs): # track gradients result.grad_norm_dict = self.track_and_norm_grad(optimizer=optimizer) - def update_train_loop_lr_schedulers(self, interval: str, monitor_metrics: Dict[str, _METRIC] = None) -> None: - num_accumulated_batches_reached = self._accumulated_batches_reached() - num_training_batches_reached = self._num_training_batches_reached() - - if num_accumulated_batches_reached or num_training_batches_reached: - # update lr - self.trainer.optimizer_connector.update_learning_rates( - interval="step", - monitor_metrics=monitor_metrics, - opt_indices=[opt_idx for opt_idx, _ in self.get_active_optimizers()], - ) + def update_lr_schedulers(self, interval: str) -> None: + if interval == "step": + finished_accumulation = self._accumulated_batches_reached() + finished_epoch = self._num_training_batches_reached() + if not finished_accumulation and not finished_epoch: + return + self.trainer.optimizer_connector.update_learning_rates( + interval=interval, + opt_indices=[opt_idx for opt_idx, _ in self.get_active_optimizers()], + ) def increment_accumulated_grad_global_step(self): num_accumulated_batches_reached = self._accumulated_batches_reached() From d274165c56cffb6ed7c69179116a9f194ba833ba Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 24 May 2021 14:34:23 +0200 Subject: [PATCH 05/32] Fix tests --- pytorch_lightning/trainer/training_loop.py | 10 +- tests/models/test_hooks.py | 33 +++++- tests/trainer/loops/test_training_loop.py | 112 --------------------- 3 files changed, 36 insertions(+), 119 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index a60ab7a5ccc32..7707e40d5a611 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -532,9 +532,7 @@ def run_training_epoch(self): # progress global step according to grads progress self.increment_accumulated_grad_global_step() - max_steps_reached = ( - self.trainer.max_steps is not None and self.trainer.max_steps <= self.trainer.global_step - ) + max_steps_reached = (self.max_steps is not None and self.max_steps <= self.global_step) if max_steps_reached or self.trainer.should_stop or self._num_training_batches_reached(is_last_batch): break @@ -542,7 +540,8 @@ def run_training_epoch(self): # dataloader/iterator did not produce a batch return - #self.trainer.global_step -= 1 + # TODO + # self.global_step -= 1 # handle epoch_output on epoch end self.on_train_epoch_end(epoch_output) @@ -558,7 +557,8 @@ def run_training_epoch(self): # if validation has run, this should have been called already self.check_checkpoint_callback(True) - #self.trainer.global_step += 1 + # TODO + # self.global_step += 1 def on_train_epoch_end(self, epoch_output: List[List[List[Result]]]) -> None: # inform logger the batch loop has finished diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 78f8d2c0a94e9..789afad2909cc 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -258,6 +258,26 @@ def __init__(self): super().__init__() self.called = [] + def training_step(self, *args, **kwargs): + self.called.append("training_step") + return super().training_step(*args, **kwargs) + + def optimizer_zero_grad(self, *args, **kwargs): + self.called.append("optimizer_zero_grad") + super().optimizer_zero_grad(*args, **kwargs) + + def backward(self, *args, **kwargs): + self.called.append("backward") + super().backward(*args, **kwargs) + + def optimizer_step(self, *args, **kwargs): + super().optimizer_step(*args, **kwargs) + self.called.append("optimizer_step") # append after as closure calls other methods + + def training_epoch_end(self, *args, **kwargs): + self.called.append("training_epoch_end") + super().training_epoch_end(*args, **kwargs) + def on_after_backward(self): self.called.append("on_after_backward") super().on_after_backward() @@ -455,18 +475,24 @@ def teardown(self, stage=None): 'on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer', + 'training_step', 'on_before_zero_grad', + 'optimizer_zero_grad', + 'backward', 'on_after_backward', + 'optimizer_step', 'on_train_batch_end', 'on_train_batch_start', 'on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer', + 'training_step', 'on_before_zero_grad', + 'optimizer_zero_grad', + 'backward', 'on_after_backward', + 'optimizer_step', 'on_train_batch_end', - 'on_train_epoch_end', - 'on_epoch_end', 'on_validation_model_eval', 'on_validation_start', 'on_epoch_start', @@ -481,6 +507,9 @@ def teardown(self, stage=None): 'on_save_checkpoint', 'on_validation_end', 'on_validation_model_train', + 'training_epoch_end', + 'on_train_epoch_end', + 'on_epoch_end', 'on_train_end', 'on_fit_end', 'teardown_fit', diff --git a/tests/trainer/loops/test_training_loop.py b/tests/trainer/loops/test_training_loop.py index 2d32d8c8878e4..89a2ae0ec0d61 100644 --- a/tests/trainer/loops/test_training_loop.py +++ b/tests/trainer/loops/test_training_loop.py @@ -17,118 +17,6 @@ from tests.helpers import BoringModel -def test_training_loop_hook_call_order(tmpdir): - """Tests that hooks / methods called in the training loop are in the correct order as detailed in the docs: - https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#hooks""" - - class HookedModel(BoringModel): - - def __init__(self): - super().__init__() - self.called = [] - - def on_epoch_start(self): - self.called.append("on_epoch_start") - super().on_epoch_start() - - def on_train_epoch_start(self): - self.called.append("on_train_epoch_start") - super().on_train_epoch_start() - - def on_train_batch_start(self, batch, batch_idx, dataloader_idx): - self.called.append("on_train_batch_start") - super().on_train_batch_start(batch, batch_idx, dataloader_idx) - - def training_step(self, batch, batch_idx): - self.called.append("training_step") - return super().training_step(batch, batch_idx) - - def on_before_zero_grad(self, optimizer): - self.called.append("on_before_zero_grad") - super().on_before_zero_grad(optimizer) - - def optimizer_zero_grad(self, epoch, batch_idx, optimizer, optimizer_idx): - self.called.append("optimizer_zero_grad") - super().optimizer_zero_grad(epoch, batch_idx, optimizer, optimizer_idx) - - def backward(self, loss, optimizer, optimizer_idx, *args, **kwargs): - self.called.append("backward") - super().backward(loss, optimizer, optimizer_idx, *args, **kwargs) - - def on_after_backward(self): - self.called.append("on_after_backward") - super().on_after_backward() - - def optimizer_step( - self, - epoch, - batch_idx, - optimizer, - optimizer_idx, - optimizer_closure, - on_tpu, - using_native_amp, - using_lbfgs, - ): - super().optimizer_step( - epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs - ) - self.called.append("optimizer_step") # append after as closure calls other methods - - def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): - self.called.append("on_train_batch_end") - super().on_train_batch_end(outputs, batch, batch_idx, dataloader_idx) - - def training_epoch_end(self, outputs): - self.called.append("training_epoch_end") - super().training_epoch_end(outputs) - - def on_train_epoch_end(self, outputs): - self.called.append("on_train_epoch_end") - super().on_train_epoch_end(outputs) - - def on_epoch_end(self): - self.called.append("on_epoch_end") - super().on_epoch_end() - - model = HookedModel() - - # fit model - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - limit_val_batches=1, - limit_train_batches=1, - limit_test_batches=1, - progress_bar_refresh_rate=0, - weights_summary=None, - ) - - assert model.called == [] - - trainer.fit(model) - expected = [ - "on_epoch_start", # validation - "on_epoch_end", - "on_epoch_start", # training - "on_train_epoch_start", - "on_train_batch_start", - "training_step", - "on_before_zero_grad", - "optimizer_zero_grad", - "backward", - "on_after_backward", - "optimizer_step", - "on_train_batch_end", - "training_epoch_end", - "on_train_epoch_end", - "on_epoch_end", - "on_epoch_start", # validation - "on_epoch_end", - ] - assert model.called == expected - - def test_outputs_format(tmpdir): """Tests that outputs objects passed to model hooks and methods are consistent and in the correct format.""" From 517970a7328128c74641e42158ed1716474728c2 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 24 May 2021 15:52:20 +0200 Subject: [PATCH 06/32] Fix tests --- tests/callbacks/test_callbacks.py | 4 +- tests/models/test_hooks.py | 494 +++++++++++++++++------------- 2 files changed, 283 insertions(+), 215 deletions(-) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 9b048e022c45b..a22e72ce09184 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -83,8 +83,6 @@ def test_trainer_callback_hook_system_fit(_, tmpdir): call.on_after_backward(trainer, model), call.on_train_batch_end(trainer, model, ANY, ANY, 2, 0), call.on_batch_end(trainer, model), - call.on_train_epoch_end(trainer, model, ANY), - call.on_epoch_end(trainer, model), call.on_validation_start(trainer, model), call.on_epoch_start(trainer, model), call.on_validation_epoch_start(trainer, model), @@ -94,6 +92,8 @@ def test_trainer_callback_hook_system_fit(_, tmpdir): call.on_epoch_end(trainer, model), call.on_validation_end(trainer, model), call.on_save_checkpoint(trainer, model), # should take ANY but we are inspecting signature for BC + call.on_train_epoch_end(trainer, model, ANY), + call.on_epoch_end(trainer, model), call.on_train_end(trainer, model), call.on_fit_end(trainer, model), call.teardown(trainer, model, 'fit'), diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 789afad2909cc..f549b5d2d8ecc 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -249,206 +249,227 @@ def on_train_batch_start(self, batch, batch_idx, dataloader_idx): assert trainer.global_step == (batch_idx_ + 1) * max_epochs -def test_trainer_model_hook_system(tmpdir): - """Test the LightningModule hook system.""" - - class HookedModel(BoringModel): - - def __init__(self): - super().__init__() - self.called = [] - - def training_step(self, *args, **kwargs): - self.called.append("training_step") - return super().training_step(*args, **kwargs) - - def optimizer_zero_grad(self, *args, **kwargs): - self.called.append("optimizer_zero_grad") - super().optimizer_zero_grad(*args, **kwargs) - - def backward(self, *args, **kwargs): - self.called.append("backward") - super().backward(*args, **kwargs) - - def optimizer_step(self, *args, **kwargs): - super().optimizer_step(*args, **kwargs) - self.called.append("optimizer_step") # append after as closure calls other methods - - def training_epoch_end(self, *args, **kwargs): - self.called.append("training_epoch_end") - super().training_epoch_end(*args, **kwargs) - - def on_after_backward(self): - self.called.append("on_after_backward") - super().on_after_backward() - - def on_before_zero_grad(self, *args, **kwargs): - self.called.append("on_before_zero_grad") - super().on_before_zero_grad(*args, **kwargs) - - def on_epoch_start(self): - self.called.append("on_epoch_start") - super().on_epoch_start() - - def on_epoch_end(self): - self.called.append("on_epoch_end") - super().on_epoch_end() - - def on_fit_start(self): - self.called.append("on_fit_start") - super().on_fit_start() - - def on_fit_end(self): - self.called.append("on_fit_end") - super().on_fit_end() +class HookedModel(BoringModel): + + def __init__(self): + super().__init__() + self.called = [] + self.train_batch = [ + 'on_train_batch_start', + 'on_before_batch_transfer', + 'transfer_batch_to_device', + 'on_after_batch_transfer', + 'training_step', + 'on_before_zero_grad', + 'optimizer_zero_grad', + 'backward', + 'on_after_backward', + 'optimizer_step', + 'on_train_batch_end', + ] + self.val_batch = [ + 'on_validation_batch_start', + 'on_before_batch_transfer', + 'transfer_batch_to_device', + 'on_after_batch_transfer', + 'on_validation_batch_end', + ] + + def training_step(self, *args, **kwargs): + self.called.append("training_step") + return super().training_step(*args, **kwargs) + + def on_before_zero_grad(self, *args, **kwargs): + self.called.append("on_before_zero_grad") + super().on_before_zero_grad(*args, **kwargs) + + def optimizer_zero_grad(self, *args, **kwargs): + self.called.append("optimizer_zero_grad") + super().optimizer_zero_grad(*args, **kwargs) + + def training_epoch_end(self, *args, **kwargs): + self.called.append("training_epoch_end") + super().training_epoch_end(*args, **kwargs) + + def backward(self, *args, **kwargs): + self.called.append("backward") + super().backward(*args, **kwargs) + + def on_after_backward(self): + self.called.append("on_after_backward") + super().on_after_backward() + + def optimizer_step(self, *args, **kwargs): + super().optimizer_step(*args, **kwargs) + self.called.append("optimizer_step") # append after as closure calls other methods + + def validation_epoch_end(self, *args, **kwargs): + self.called.append("validation_epoch_end") + super().validation_epoch_end(*args, **kwargs) + + def on_epoch_start(self): + self.called.append("on_epoch_start") + super().on_epoch_start() + + def on_epoch_end(self): + self.called.append("on_epoch_end") + super().on_epoch_end() + + def on_fit_start(self): + self.called.append("on_fit_start") + super().on_fit_start() + + def on_fit_end(self): + self.called.append("on_fit_end") + super().on_fit_end() + + def on_hpc_load(self, *args, **kwargs): + self.called.append("on_hpc_load") + super().on_hpc_load(*args, **kwargs) + + def on_hpc_save(self, *args, **kwargs): + self.called.append("on_hpc_save") + super().on_hpc_save(*args, **kwargs) + + def on_load_checkpoint(self, *args, **kwargs): + self.called.append("on_load_checkpoint") + super().on_load_checkpoint(*args, **kwargs) + + def on_save_checkpoint(self, *args, **kwargs): + self.called.append("on_save_checkpoint") + super().on_save_checkpoint(*args, **kwargs) + + def on_pretrain_routine_start(self): + self.called.append("on_pretrain_routine_start") + super().on_pretrain_routine_start() + + def on_pretrain_routine_end(self): + self.called.append("on_pretrain_routine_end") + super().on_pretrain_routine_end() + + def on_train_start(self): + self.called.append("on_train_start") + super().on_train_start() + + def on_train_end(self): + self.called.append("on_train_end") + super().on_train_end() + + def on_before_batch_transfer(self, *args, **kwargs): + self.called.append("on_before_batch_transfer") + return super().on_before_batch_transfer(*args, **kwargs) + + def transfer_batch_to_device(self, *args, **kwargs): + self.called.append("transfer_batch_to_device") + return super().transfer_batch_to_device(*args, **kwargs) + + def on_after_batch_transfer(self, *args, **kwargs): + self.called.append("on_after_batch_transfer") + return super().on_after_batch_transfer(*args, **kwargs) - def on_hpc_load(self, *args, **kwargs): - self.called.append("on_hpc_load") - super().on_hpc_load(*args, **kwargs) - - def on_hpc_save(self, *args, **kwargs): - self.called.append("on_hpc_save") - super().on_hpc_save(*args, **kwargs) - - def on_load_checkpoint(self, *args, **kwargs): - self.called.append("on_load_checkpoint") - super().on_load_checkpoint(*args, **kwargs) - - def on_save_checkpoint(self, *args, **kwargs): - self.called.append("on_save_checkpoint") - super().on_save_checkpoint(*args, **kwargs) - - def on_pretrain_routine_start(self): - self.called.append("on_pretrain_routine_start") - super().on_pretrain_routine_start() - - def on_pretrain_routine_end(self): - self.called.append("on_pretrain_routine_end") - super().on_pretrain_routine_end() - - def on_train_start(self): - self.called.append("on_train_start") - super().on_train_start() - - def on_train_end(self): - self.called.append("on_train_end") - super().on_train_end() - - def on_before_batch_transfer(self, *args, **kwargs): - self.called.append("on_before_batch_transfer") - return super().on_before_batch_transfer(*args, **kwargs) - - def transfer_batch_to_device(self, *args, **kwargs): - self.called.append("transfer_batch_to_device") - return super().transfer_batch_to_device(*args, **kwargs) - - def on_after_batch_transfer(self, *args, **kwargs): - self.called.append("on_after_batch_transfer") - return super().on_after_batch_transfer(*args, **kwargs) + def on_train_batch_start(self, *args, **kwargs): + self.called.append("on_train_batch_start") + super().on_train_batch_start(*args, **kwargs) - def on_train_batch_start(self, *args, **kwargs): - self.called.append("on_train_batch_start") - super().on_train_batch_start(*args, **kwargs) + def on_train_batch_end(self, *args, **kwargs): + self.called.append("on_train_batch_end") + super().on_train_batch_end(*args, **kwargs) - def on_train_batch_end(self, *args, **kwargs): - self.called.append("on_train_batch_end") - super().on_train_batch_end(*args, **kwargs) + def on_train_epoch_start(self): + self.called.append("on_train_epoch_start") + super().on_train_epoch_start() - def on_train_epoch_start(self): - self.called.append("on_train_epoch_start") - super().on_train_epoch_start() + def on_train_epoch_end(self): + self.called.append("on_train_epoch_end") + super().on_train_epoch_end() - def on_train_epoch_end(self): - self.called.append("on_train_epoch_end") - super().on_train_epoch_end() + def on_validation_start(self): + self.called.append("on_validation_start") + super().on_validation_start() - def on_validation_start(self): - self.called.append("on_validation_start") - super().on_validation_start() + def on_validation_end(self): + self.called.append("on_validation_end") + super().on_validation_end() - def on_validation_end(self): - self.called.append("on_validation_end") - super().on_validation_end() + def on_validation_batch_start(self, *args, **kwargs): + self.called.append("on_validation_batch_start") + super().on_validation_batch_start(*args, **kwargs) - def on_validation_batch_start(self, *args, **kwargs): - self.called.append("on_validation_batch_start") - super().on_validation_batch_start(*args, **kwargs) + def on_validation_batch_end(self, *args, **kwargs): + self.called.append("on_validation_batch_end") + super().on_validation_batch_end(*args, **kwargs) - def on_validation_batch_end(self, *args, **kwargs): - self.called.append("on_validation_batch_end") - super().on_validation_batch_end(*args, **kwargs) + def on_validation_epoch_start(self): + self.called.append("on_validation_epoch_start") + super().on_validation_epoch_start() - def on_validation_epoch_start(self): - self.called.append("on_validation_epoch_start") - super().on_validation_epoch_start() + def on_validation_epoch_end(self, *args, **kwargs): + self.called.append("on_validation_epoch_end") + super().on_validation_epoch_end(*args, **kwargs) - def on_validation_epoch_end(self, *args, **kwargs): - self.called.append("on_validation_epoch_end") - super().on_validation_epoch_end(*args, **kwargs) + def on_test_start(self): + self.called.append("on_test_start") + super().on_test_start() - def on_test_start(self): - self.called.append("on_test_start") - super().on_test_start() + def on_test_batch_start(self, *args, **kwargs): + self.called.append("on_test_batch_start") + super().on_test_batch_start(*args, **kwargs) - def on_test_batch_start(self, *args, **kwargs): - self.called.append("on_test_batch_start") - super().on_test_batch_start(*args, **kwargs) + def on_test_batch_end(self, *args, **kwargs): + self.called.append("on_test_batch_end") + super().on_test_batch_end(*args, **kwargs) - def on_test_batch_end(self, *args, **kwargs): - self.called.append("on_test_batch_end") - super().on_test_batch_end(*args, **kwargs) + def on_test_epoch_start(self): + self.called.append("on_test_epoch_start") + super().on_test_epoch_start() - def on_test_epoch_start(self): - self.called.append("on_test_epoch_start") - super().on_test_epoch_start() + def on_test_epoch_end(self, *args, **kwargs): + self.called.append("on_test_epoch_end") + super().on_test_epoch_end(*args, **kwargs) - def on_test_epoch_end(self, *args, **kwargs): - self.called.append("on_test_epoch_end") - super().on_test_epoch_end(*args, **kwargs) + def on_validation_model_eval(self): + self.called.append("on_validation_model_eval") + super().on_validation_model_eval() - def on_validation_model_eval(self): - self.called.append("on_validation_model_eval") - super().on_validation_model_eval() + def on_validation_model_train(self): + self.called.append("on_validation_model_train") + super().on_validation_model_train() - def on_validation_model_train(self): - self.called.append("on_validation_model_train") - super().on_validation_model_train() + def on_test_model_eval(self): + self.called.append("on_test_model_eval") + super().on_test_model_eval() - def on_test_model_eval(self): - self.called.append("on_test_model_eval") - super().on_test_model_eval() + def on_test_model_train(self): + self.called.append("on_test_model_train") + super().on_test_model_train() - def on_test_model_train(self): - self.called.append("on_test_model_train") - super().on_test_model_train() + def on_test_end(self): + self.called.append("on_test_end") + super().on_test_end() - def on_test_end(self): - self.called.append("on_test_end") - super().on_test_end() + def setup(self, stage=None): + self.called.append(f"setup_{stage}") + super().setup(stage=stage) - def setup(self, stage=None): - self.called.append(f"setup_{stage}") - super().setup(stage=stage) + def teardown(self, stage=None): + self.called.append(f"teardown_{stage}") + super().teardown(stage) - def teardown(self, stage=None): - self.called.append(f"teardown_{stage}") - super().teardown(stage) +def test_trainer_model_hook_system_fit(tmpdir): + """Test the LightningModule hook system.""" model = HookedModel() - - # fit model + train_batches = 2 + val_batches = 2 trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, - limit_val_batches=1, - limit_train_batches=2, - limit_test_batches=1, + limit_train_batches=train_batches, + limit_val_batches=val_batches, progress_bar_refresh_rate=0, weights_summary=None, ) - assert model.called == [] - trainer.fit(model) expected = [ 'setup_fit', @@ -459,11 +480,8 @@ def teardown(self, stage=None): 'on_validation_start', 'on_epoch_start', 'on_validation_epoch_start', - 'on_validation_batch_start', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'on_validation_batch_end', + *(model.val_batch * val_batches), + 'validation_epoch_end', 'on_validation_epoch_end', 'on_epoch_end', 'on_validation_end', @@ -471,37 +489,13 @@ def teardown(self, stage=None): 'on_train_start', 'on_epoch_start', 'on_train_epoch_start', - 'on_train_batch_start', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'training_step', - 'on_before_zero_grad', - 'optimizer_zero_grad', - 'backward', - 'on_after_backward', - 'optimizer_step', - 'on_train_batch_end', - 'on_train_batch_start', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'training_step', - 'on_before_zero_grad', - 'optimizer_zero_grad', - 'backward', - 'on_after_backward', - 'optimizer_step', - 'on_train_batch_end', + *(model.train_batch * train_batches), 'on_validation_model_eval', 'on_validation_start', 'on_epoch_start', 'on_validation_epoch_start', - 'on_validation_batch_start', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'on_validation_batch_end', + *(model.val_batch * val_batches), + 'validation_epoch_end', 'on_validation_epoch_end', 'on_epoch_end', 'on_save_checkpoint', @@ -516,8 +510,52 @@ def teardown(self, stage=None): ] assert model.called == expected + +def test_trainer_model_hook_system_fit_no_val(tmpdir): model = HookedModel() + train_batches = 2 + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_val_batches=0, + limit_train_batches=train_batches, + progress_bar_refresh_rate=0, + weights_summary=None, + ) + assert model.called == [] + trainer.fit(model) + expected = [ + 'setup_fit', + 'on_fit_start', + 'on_pretrain_routine_start', + 'on_pretrain_routine_end', + 'on_train_start', + 'on_epoch_start', + 'on_train_epoch_start', + *(model.train_batch * train_batches), + 'training_epoch_end', + 'on_train_epoch_end', + 'on_epoch_end', + 'on_save_checkpoint', # from train epoch end + 'on_save_checkpoint', # from on train end + 'on_train_end', + 'on_fit_end', + 'teardown_fit', + ] + assert model.called == expected + + +def test_trainer_model_hook_system_validate(tmpdir): + model = HookedModel() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_val_batches=1, + progress_bar_refresh_rate=0, + weights_summary=None, + ) + assert model.called == [] trainer.validate(model, verbose=False) expected = [ 'setup_validate', @@ -530,6 +568,7 @@ def teardown(self, stage=None): 'transfer_batch_to_device', 'on_after_batch_transfer', 'on_validation_batch_end', + 'validation_epoch_end', 'on_validation_epoch_end', 'on_epoch_end', 'on_validation_end', @@ -538,9 +577,18 @@ def teardown(self, stage=None): ] assert model.called == expected + +def test_trainer_model_hook_system_test(tmpdir): model = HookedModel() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_test_batches=1, + progress_bar_refresh_rate=0, + weights_summary=None, + ) + assert model.called == [] trainer.test(model, verbose=False) - expected = [ 'setup_test', 'on_test_model_eval', @@ -676,30 +724,50 @@ def on_after_batch_transfer(self, *args, **kwargs): reload_dataloaders_every_epoch=True, ) trainer.fit(model, datamodule=dm) - expected = [ - 'prepare_data', 'setup_fit', 'val_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device', - 'on_after_batch_transfer', 'train_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device', - 'on_after_batch_transfer', 'on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer', - 'val_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer', - 'teardown_fit' + 'prepare_data', + 'setup_fit', + 'val_dataloader', + 'on_before_batch_transfer', + 'transfer_batch_to_device', + 'on_after_batch_transfer', + 'train_dataloader', + 'on_before_batch_transfer', + 'transfer_batch_to_device', + 'on_after_batch_transfer', + 'on_before_batch_transfer', + 'transfer_batch_to_device', + 'on_after_batch_transfer', + 'val_dataloader', + 'on_before_batch_transfer', + 'transfer_batch_to_device', + 'on_after_batch_transfer', + 'teardown_fit', ] assert dm.called == expected dm = HookedDataModule() trainer.validate(model, datamodule=dm, verbose=False) - expected = [ - 'prepare_data', 'setup_validate', 'val_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device', - 'on_after_batch_transfer', 'teardown_validate' + 'prepare_data', + 'setup_validate', + 'val_dataloader', + 'on_before_batch_transfer', + 'transfer_batch_to_device', + 'on_after_batch_transfer', + 'teardown_validate', ] assert dm.called == expected dm = HookedDataModule() trainer.test(model, datamodule=dm, verbose=False) - expected = [ - 'prepare_data', 'setup_test', 'test_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device', - 'on_after_batch_transfer', 'teardown_test' + 'prepare_data', + 'setup_test', + 'test_dataloader', + 'on_before_batch_transfer', + 'transfer_batch_to_device', + 'on_after_batch_transfer', + 'teardown_test', ] assert dm.called == expected From c8500d7a2388e57cdf153bd8f67f4ad69a97caad Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 24 May 2021 17:35:54 +0200 Subject: [PATCH 07/32] Fix tests --- pytorch_lightning/trainer/training_loop.py | 29 +++++++++++----------- tests/models/test_hooks.py | 5 ++-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 7707e40d5a611..d9148be8f551e 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -14,7 +14,7 @@ from collections import OrderedDict from contextlib import contextmanager, suppress -from copy import copy, deepcopy +from copy import copy from functools import partial, update_wrapper from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -34,7 +34,6 @@ from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.parsing import AttributeDict from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature -from pytorch_lightning.utilities.types import _METRIC from pytorch_lightning.utilities.warnings import WarningCache @@ -874,30 +873,30 @@ def should_accumulate(self): is_final_batch = self._num_training_batches_reached() return not (accumulation_done or is_final_batch) - def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool, on_epoch: bool = False) -> bool: + def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool) -> bool: """ Decide if we should run validation. """ - if not self.trainer.enable_validation: return False - # check if this epoch is eligible to run validation - if (self.trainer.current_epoch + 1) % self.trainer.check_val_every_n_epoch != 0: + is_val_check_epoch = (self.trainer.current_epoch + 1) % self.trainer.check_val_every_n_epoch != 1 + if not is_val_check_epoch: return False + is_infinite_dataset = self.trainer.val_check_batch == float('inf') + if is_last_batch and is_infinite_dataset: + return True + + if self.trainer.should_stop: + return True + # val_check_batch is inf for iterable datasets with no length defined # TODO: let training/eval loop handle logic around limit_*_batches and val_check_batch - is_val_check_batch = False - if isinstance(self.trainer.limit_train_batches, int) and self.trainer.val_check_batch == float('inf'): + is_val_check_batch = is_last_batch + if isinstance(self.trainer.limit_train_batches, int) and is_infinite_dataset: is_val_check_batch = (batch_idx + 1) % self.trainer.limit_train_batches == 0 elif self.trainer.val_check_batch != float('inf'): is_val_check_batch = (batch_idx + 1) % self.trainer.val_check_batch == 0 - - is_last_batch_for_infinite_dataset = is_last_batch and self.trainer.val_check_batch == float("inf") - - if on_epoch: - return is_val_check_batch or self.trainer.should_stop or is_last_batch_for_infinite_dataset - else: - return is_val_check_batch + return is_val_check_batch def _build_kwargs(self, batch, batch_idx, opt_idx, hiddens): # enable not needing to add opt_idx to training_step diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index f549b5d2d8ecc..18e086f44e329 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -229,7 +229,7 @@ def train_dataloader(self): trainer.fit(model) -@pytest.mark.parametrize('max_epochs,batch_idx_', [(2, 5), (3, 8), (4, 12)]) +@pytest.mark.parametrize('max_epochs,batch_idx_', [(2, 5), (3, 8), (4, 70)]) def test_on_train_batch_start_hook(max_epochs, batch_idx_): class CurrentModel(BoringModel): @@ -241,12 +241,13 @@ def on_train_batch_start(self, batch, batch_idx, dataloader_idx): model = CurrentModel() trainer = Trainer(max_epochs=max_epochs) trainer.fit(model) + assert len(model.val_dataloader()) < 70 if batch_idx_ > len(model.val_dataloader()) - 1: assert trainer.train_loop.batch_idx == len(model.val_dataloader()) - 1 assert trainer.global_step == len(model.val_dataloader()) * max_epochs else: assert trainer.train_loop.batch_idx == batch_idx_ - assert trainer.global_step == (batch_idx_ + 1) * max_epochs + assert trainer.global_step == batch_idx_ * max_epochs class HookedModel(BoringModel): From 0c2305dc9d032cb357d553b017ebf75beec03182 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 24 May 2021 17:37:07 +0200 Subject: [PATCH 08/32] Minor change --- pytorch_lightning/trainer/training_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index d9148be8f551e..383ed3102d462 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -878,7 +878,7 @@ def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool) -> bool: if not self.trainer.enable_validation: return False - is_val_check_epoch = (self.trainer.current_epoch + 1) % self.trainer.check_val_every_n_epoch != 1 + is_val_check_epoch = (self.trainer.current_epoch + 1) % self.trainer.check_val_every_n_epoch == 0 if not is_val_check_epoch: return False From f02a866dcc125afe438b928dde6c97b4ea78b256 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 01:40:53 +0200 Subject: [PATCH 09/32] Fix test --- pytorch_lightning/trainer/training_loop.py | 6 ------ tests/callbacks/test_early_stopping.py | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 383ed3102d462..ac29088f838c2 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -539,9 +539,6 @@ def run_training_epoch(self): # dataloader/iterator did not produce a batch return - # TODO - # self.global_step -= 1 - # handle epoch_output on epoch end self.on_train_epoch_end(epoch_output) @@ -556,9 +553,6 @@ def run_training_epoch(self): # if validation has run, this should have been called already self.check_checkpoint_callback(True) - # TODO - # self.global_step += 1 - def on_train_epoch_end(self, epoch_output: List[List[List[Result]]]) -> None: # inform logger the batch loop has finished self.trainer.logger_connector.on_train_epoch_end() diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py index 8f89bedeb4f38..7d303e6ed00d6 100644 --- a/tests/callbacks/test_early_stopping.py +++ b/tests/callbacks/test_early_stopping.py @@ -157,7 +157,7 @@ def test_early_stopping_patience_train( """Test to ensure that early stopping is not triggered before patience is exhausted.""" class ModelOverrideTrainReturn(BoringModel): - train_return_values = torch.Tensor(loss_values) + train_return_values = torch.tensor(loss_values) def training_epoch_end(self, outputs): loss = self.train_return_values[self.current_epoch] @@ -169,7 +169,7 @@ def training_epoch_end(self, outputs): model.validation_step = None early_stop_callback = EarlyStopping( - monitor="train_loss", patience=patience, verbose=True, check_on_train_epoch_end=validation_step_none + monitor="train_loss", patience=patience, verbose=True, check_on_train_epoch_end=True ) trainer = Trainer( default_root_dir=tmpdir, From fa31597073a56e6d86cd928cb19dd4d2d54566f2 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 01:47:00 +0200 Subject: [PATCH 10/32] Increment the total batch idx before the accumulation early exit --- pytorch_lightning/trainer/training_loop.py | 4 ++-- tests/tuner/test_lr_finder.py | 27 ++++++++-------------- 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index ca50b088c665b..10b727edbc93a 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -529,6 +529,8 @@ def run_training_epoch(self): self.update_train_loop_lr_schedulers(monitor_metrics=monitor_metrics) self.trainer.checkpoint_connector.has_trained = True + self.total_batch_idx += 1 + # max steps reached, end training if ( self.max_steps is not None and self.max_steps <= self.global_step + 1 @@ -542,8 +544,6 @@ def run_training_epoch(self): if self.trainer.should_stop: break - self.total_batch_idx += 1 - # stop epoch if we limited the number of training batches if self._num_training_batches_reached(is_last_batch): break diff --git a/tests/tuner/test_lr_finder.py b/tests/tuner/test_lr_finder.py index 608cb8c6778bf..d4e5ef1862020 100644 --- a/tests/tuner/test_lr_finder.py +++ b/tests/tuner/test_lr_finder.py @@ -197,31 +197,24 @@ def test_datamodule_parameter(tmpdir): def test_accumulation_and_early_stopping(tmpdir): - """ Test that early stopping of learning rate finder works, and that - accumulation also works for this feature """ + """ Test that early stopping of learning rate finder works, and that accumulation also works for this feature """ - hparams = EvalModelTemplate.get_default_hparams() - model = EvalModelTemplate(**hparams) + class TestModel(BoringModel): - before_lr = hparams.get('learning_rate') - # logger file to get meta + def __init__(self): + super().__init__() + self.lr = 1e-3 + + model = TestModel() trainer = Trainer( default_root_dir=tmpdir, accumulate_grad_batches=2, ) - lrfinder = trainer.tuner.lr_find(model, early_stop_threshold=None) - after_lr = lrfinder.suggestion() - expected_num_lrs = 100 - expected_batch_idx = 200 - 1 - - assert before_lr != after_lr, \ - 'Learning rate was not altered after running learning rate finder' - assert len(lrfinder.results['lr']) == expected_num_lrs, \ - 'Early stopping for learning rate finder did not work' - assert lrfinder._total_batch_idx == expected_batch_idx, \ - 'Accumulation parameter did not work' + assert 1e-3 != lrfinder.suggestion() + assert len(lrfinder.results['lr']) == 100 + assert lrfinder._total_batch_idx == 200 def test_suggestion_parameters_work(tmpdir): From 64c49c11346cc383af36c70eca8ba9424e45e663 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 01:52:12 +0200 Subject: [PATCH 11/32] Update CHANGELOG --- CHANGELOG.md | 3 +++ tests/tuner/test_lr_finder.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ccb83f41ca84..14dbee08920be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -127,6 +127,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed global step update when the epoch is skipped ([#7677](https://github.com/PyTorchLightning/pytorch-lightning/pull/7677)) +- Fixed training loop total batch counter when accumulate grad batches was enabled ([#7692](https://github.com/PyTorchLightning/pytorch-lightning/pull/7692)) + + - Fixed broadcasting in multi-node, multi-gpu DDP using torch 1.7 ([#7592](https://github.com/PyTorchLightning/pytorch-lightning/pull/7592)) diff --git a/tests/tuner/test_lr_finder.py b/tests/tuner/test_lr_finder.py index d4e5ef1862020..a74af3862c473 100644 --- a/tests/tuner/test_lr_finder.py +++ b/tests/tuner/test_lr_finder.py @@ -212,7 +212,7 @@ def __init__(self): ) lrfinder = trainer.tuner.lr_find(model, early_stop_threshold=None) - assert 1e-3 != lrfinder.suggestion() + assert lrfinder.suggestion() != 1e-3 assert len(lrfinder.results['lr']) == 100 assert lrfinder._total_batch_idx == 200 From a3d328f8fc61ac576c9a202a0d42e2c5434a7cf7 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 02:27:03 +0200 Subject: [PATCH 12/32] Fix test --- pytorch_lightning/trainer/training_loop.py | 6 +++ tests/loggers/test_tensorboard.py | 49 ++++++---------------- 2 files changed, 18 insertions(+), 37 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index ac29088f838c2..dce38ef3f82c6 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -542,8 +542,14 @@ def run_training_epoch(self): # handle epoch_output on epoch end self.on_train_epoch_end(epoch_output) + # the global step is manually decreased here due to backwards compatibility with existing loggers + # as they expect that the same step is used for the following epoch_end calls even when the batch loop has + # finished. this means the attribute does not exactly track the number of optimizer steps applied. + # TODO(@carmocca): deprecate and rename so users don't get confused + self.global_step -= 1 # log epoch metrics self.trainer.logger_connector.log_train_epoch_end_metrics(epoch_output) + self.global_step += 1 self.update_lr_schedulers('epoch') diff --git a/tests/loggers/test_tensorboard.py b/tests/loggers/test_tensorboard.py index f22cdcfe2bba4..ed16bfba8c5ba 100644 --- a/tests/loggers/test_tensorboard.py +++ b/tests/loggers/test_tensorboard.py @@ -264,67 +264,42 @@ def test_tensorboard_log_graph_warning_no_example_input_array(tmpdir): @mock.patch('pytorch_lightning.loggers.TensorBoardLogger.log_metrics') -@pytest.mark.parametrize('expected', [ - ([5, 11, 17]), -]) -def test_tensorboard_with_accummulated_gradients(mock_log_metrics, expected, tmpdir): - """ - Tests to ensure that tensorboard log properly when accumulated_gradients > 1 - """ +def test_tensorboard_with_a3ccummulated_gradients(mock_log_metrics, tmpdir): + """Tests to ensure that tensorboard log properly when accumulated_gradients > 1""" class TestModel(BoringModel): def __init__(self): super().__init__() - self._count = 0 - self._indexes = [] - - def training_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - self.log('count', self._count, on_step=True, on_epoch=True) - self.log('loss', loss, on_step=True, on_epoch=True) + self.indexes = [] + def training_step(self, *args): + self.log('foo', 1, on_step=True, on_epoch=True) if not self.trainer.train_loop.should_accumulate(): if self.trainer.logger_connector.should_update_logs: - self._indexes.append(self.trainer.global_step) - - return loss - - def validation_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - self.log('val_loss', loss, on_step=True, on_epoch=True) - return loss - - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=.001) - lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) - return [optimizer], [lr_scheduler] + self.indexes.append(self.trainer.global_step) + return super().training_step(*args) model = TestModel() model.training_epoch_end = None - model.validation_epoch_end = None - logger_0 = TensorBoardLogger(tmpdir, default_hp_metric=False) - trainer = Trainer( default_root_dir=tmpdir, limit_train_batches=12, limit_val_batches=0, max_epochs=3, - gpus=0, accumulate_grad_batches=2, logger=[logger_0], log_every_n_steps=3, ) trainer.fit(model) - mock_count_epochs = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_epoch" in m[2]["metrics"]] - assert mock_count_epochs == expected + calls = [m[2] for m in mock_log_metrics.mock_calls] + count_epochs = [c["step"] for c in calls if "foo_epoch" in c["metrics"]] + assert count_epochs == [5, 11, 17] - mock_count_steps = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_step" in m[2]["metrics"]] - assert model._indexes == mock_count_steps + count_steps = [c["step"] for c in calls if "foo_step" in c["metrics"]] + assert count_steps == model.indexes @mock.patch('pytorch_lightning.loggers.tensorboard.SummaryWriter') From 5a00cec4755e9b8a5eb6f7d42c92168bf1f5870b Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 02:30:13 +0200 Subject: [PATCH 13/32] Comment --- pytorch_lightning/trainer/training_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index dce38ef3f82c6..12be2ce067524 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -543,7 +543,7 @@ def run_training_epoch(self): self.on_train_epoch_end(epoch_output) # the global step is manually decreased here due to backwards compatibility with existing loggers - # as they expect that the same step is used for the following epoch_end calls even when the batch loop has + # as they expect that the same step is used when logging epoch end metrics even when the batch loop has # finished. this means the attribute does not exactly track the number of optimizer steps applied. # TODO(@carmocca): deprecate and rename so users don't get confused self.global_step -= 1 From ca7804fb9c3cd9952509277153efabf93a6054ce Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 03:39:13 +0200 Subject: [PATCH 14/32] Fix test --- tests/checkpointing/test_model_checkpoint.py | 45 +++++++++----------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 3d7a35917e095..0bdba7ce0c3da 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -180,23 +180,21 @@ def test_model_checkpoint_score_and_ckpt_val_check_interval( max_epochs = 3 limit_train_batches = 12 limit_val_batches = 7 - lr = 1e-1 + lr, gamma = 1e-1, 2 monitor = 'val_log' - per_epoch_steps = int(limit_train_batches * val_check_interval) - per_epoch_call_count = limit_train_batches // per_epoch_steps - left_over_steps = limit_train_batches % per_epoch_steps + per_epoch_val_checks = int(limit_train_batches * val_check_interval) + per_val_check_steps, leftover_steps = divmod(limit_train_batches, per_epoch_val_checks) class CustomBoringModel(BoringModel): def __init__(self): super().__init__() - self.val_logs = torch.randn(per_epoch_call_count * max_epochs, limit_val_batches) + self.val_logs = torch.randn(per_val_check_steps * max_epochs, limit_val_batches) self.val_loop_count = 0 def validation_step(self, batch, batch_idx): log_value = self.val_logs[self.val_loop_count, batch_idx] self.log('val_log', log_value) - self.log('epoch', self.current_epoch, on_epoch=True) return super().validation_step(batch, batch_idx) def validation_epoch_end(self, outputs): @@ -213,7 +211,7 @@ def configure_optimizers(self): 'strict': True, } else: - lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1) + lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma) return [optimizer], [lr_scheduler] @@ -241,26 +239,27 @@ def configure_optimizers(self): # on_train_end ckpt callback is called which creates an additional ckpt in case no ckpt is created at the # end of epoch, thus if val_check_interval doesn't align with the training steps we create an additional ckpt - additional_ckpt, additional_ckpt_path = 0, None + additional_ckpt, additional_ckpt_path = False, None if not epoch_aligned: additional_ckpt_path = [f for f in ckpt_files if 'v1' in f.stem][0] - additional_ckpt = 1 + additional_ckpt = True - additional_ckpt = 1 if not epoch_aligned else 0 - assert len(ckpt_files) == len(scores) + additional_ckpt == per_epoch_call_count * max_epochs + additional_ckpt + assert len(ckpt_files) == len(scores) + additional_ckpt == per_val_check_steps * max_epochs + additional_ckpt assert len(lr_scheduler_debug) == max_epochs - def _make_assertions(epoch, ix, add=''): - global_ix = ix + per_epoch_call_count * epoch + def _make_assertions(epoch, ix, version=''): + global_ix = ix + per_val_check_steps * epoch + duplicated = bool(version) + score = scores[global_ix] expected_score = getattr(model, f'{monitor}s')[global_ix].mean().item() - expected_filename = f'{monitor}={score:.4f}-epoch={epoch}{add}.ckpt' + expected_filename = f'{monitor}={score:.4f}-epoch={epoch}{version}.ckpt' assert math.isclose(score, expected_score, rel_tol=1e-4) chk = pl_load(os.path.join(checkpoint.dirpath, expected_filename)) assert chk['epoch'] == epoch + 1 - epoch_num = epoch + (1 if add else 0) - expected_global_step = per_epoch_steps * (global_ix + 1) + (left_over_steps * epoch_num) + epoch_num = epoch + duplicated + expected_global_step = per_epoch_val_checks * (global_ix + 1) + (leftover_steps * epoch_num) assert chk['global_step'] == expected_global_step mc_specific_data = chk['callbacks'][type(checkpoint)] @@ -269,15 +268,15 @@ def _make_assertions(epoch, ix, add=''): assert mc_specific_data['current_score'] == score if not reduce_lr_on_plateau: - lr_scheduler_specific_data = chk['lr_schedulers'][0] - did_update = 1 if (ix + 1 == per_epoch_call_count) and (epoch_aligned or add) else 0 - assert lr_scheduler_specific_data['_step_count'] == epoch + 1 + did_update - assert lr_scheduler_specific_data['_last_lr'][0] == lr * (lr**(epoch + did_update)) + actual_step_count = chk['lr_schedulers'][0]['_step_count'] + actual_lr = chk['lr_schedulers'][0]['_last_lr'][0] + assert actual_step_count == epoch + 1 + duplicated + assert actual_lr == lr * gamma**(epoch + duplicated) return score for epoch in range(max_epochs): - for i in range(per_epoch_call_count): + for i in range(per_val_check_steps): score = _make_assertions(epoch, i) assert lr_scheduler_debug[epoch]['monitor_val'] == (score if reduce_lr_on_plateau else None) @@ -285,9 +284,7 @@ def _make_assertions(epoch, ix, add=''): # check the ckpt file saved on_train_end if additional_ckpt_path: - epoch = max_epochs - 1 - i = per_epoch_call_count - 1 - _make_assertions(epoch, i, add='-v1') + _make_assertions(max_epochs - 1, per_val_check_steps - 1, version='-v1') @pytest.mark.parametrize("save_top_k", [-1, 0, 1, 2]) From d91eaf4b55cf6f2ba7199453bdb9a213ca7f613c Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 04:40:55 +0200 Subject: [PATCH 15/32] Fix ModelCheckpoint tests --- pytorch_lightning/trainer/training_loop.py | 10 ++++++---- tests/checkpointing/test_model_checkpoint.py | 16 +++++++++------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 12be2ce067524..08bbfb709d5b6 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -553,11 +553,13 @@ def run_training_epoch(self): self.update_lr_schedulers('epoch') - should_skip_eval = self.trainer.evaluation_loop.should_skip_evaluation(self.trainer.num_val_batches) - should_train_only = self.trainer.disable_validation or should_skip_eval - if should_train_only: - # if validation has run, this should have been called already + did_train_only = self.trainer.disable_validation or self.trainer.evaluation_loop.should_skip_evaluation( + self.trainer.num_val_batches + ) + if did_train_only: + self.global_step -= 1 self.check_checkpoint_callback(True) + self.global_step += 1 def on_train_epoch_end(self, epoch_output: List[List[List[Result]]]) -> None: # inform logger the batch loop has finished diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 0bdba7ce0c3da..f26c578151b80 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -60,9 +60,8 @@ def validation_epoch_end(self, outputs): "validation_step_none,val_dataloaders_none,monitor", [ (False, False, 'val_log'), - (False, False, 'train_log_epoch'), (True, False, 'train_log_epoch'), - (False, True, 'train_log_epoch'), + (False, True, 'val_log'), ], ) @pytest.mark.parametrize('reduce_lr_on_plateau', [False, True]) @@ -76,7 +75,7 @@ def test_model_checkpoint_score_and_ckpt( max_epochs = 3 limit_train_batches = 5 limit_val_batches = 7 - lr = 1e-1 + lr, gamma = 1e-1, 2 class CustomBoringModel(BoringModel): @@ -106,7 +105,7 @@ def configure_optimizers(self): 'strict': True, } else: - lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1) + lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma) return [optimizer], [lr_scheduler] @@ -153,9 +152,12 @@ def configure_optimizers(self): assert mc_specific_data['current_score'] == score if not reduce_lr_on_plateau: - lr_scheduler_specific_data = chk['lr_schedulers'][0] - assert lr_scheduler_specific_data['_step_count'] == epoch + 2 - assert lr_scheduler_specific_data['_last_lr'][0] == lr * (lr**(epoch + 1)) + actual_step_count = chk['lr_schedulers'][0]['_step_count'] + actual_lr = chk['lr_schedulers'][0]['_last_lr'][0] + # if validation_step_none, the checkpoint gets saved after the learning rate update + # so we need to increase the count by one + assert actual_step_count == epoch + 1 + validation_step_none + assert actual_lr == lr * gamma**(epoch + validation_step_none) assert lr_scheduler_debug[epoch]['monitor_val'] == (score if reduce_lr_on_plateau else None) assert lr_scheduler_debug[epoch]['monitor_key'] == (monitor if reduce_lr_on_plateau else None) From a66b96fb5a3082ae1df73bef8279bb994ea0ac4c Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 11:59:27 +0200 Subject: [PATCH 16/32] Update test --- tests/models/test_hooks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 7816bc74829ea..1298248510673 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -517,7 +517,6 @@ def test_trainer_model_hook_system_fit_no_val(tmpdir): 'on_train_epoch_end', 'on_epoch_end', 'on_save_checkpoint', # from train epoch end - 'on_save_checkpoint', # from on train end 'on_train_end', 'on_fit_end', 'teardown_fit', From 2eb31ed3f432021379f7e158bab3177f7af80310 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 13:37:47 +0200 Subject: [PATCH 17/32] Conflicts --- pl_examples/bug_report_model.py | 1 + pytorch_lightning/trainer/training_loop.py | 10 ++-------- tests/checkpointing/test_model_checkpoint.py | 16 ++++++++-------- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py index abb65ba86fd93..70591f3649fd9 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report_model.py @@ -1,4 +1,5 @@ import os +from unittest.mock import MagicMock import torch from torch.utils.data import DataLoader, Dataset diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 90cbb0abfe23c..09a32c3c96aad 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -884,6 +884,7 @@ def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool) -> bool: if not is_val_check_epoch: return False + # val_check_batch is inf for iterable datasets with no length defined is_infinite_dataset = self.trainer.val_check_batch == float('inf') if is_last_batch and is_infinite_dataset: return True @@ -891,19 +892,12 @@ def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool) -> bool: if self.trainer.should_stop: return True - # val_check_batch is inf for iterable datasets with no length defined - is_infinite_dataset = self.trainer.val_check_batch == float('inf') - if on_epoch and is_last_batch and is_infinite_dataset: - return True - - if on_epoch and self.trainer.should_stop: - return True - # TODO: let training/eval loop handle logic around limit_*_batches and val_check_batch is_val_check_batch = is_last_batch if isinstance(self.trainer.limit_train_batches, int) and is_infinite_dataset: is_val_check_batch = (batch_idx + 1) % self.trainer.limit_train_batches == 0 elif self.trainer.val_check_batch != float('inf'): + is_val_check_batch = (batch_idx + 1) % self.trainer.val_check_batch == 0 return is_val_check_batch def _build_kwargs(self, batch, batch_idx, opt_idx, hiddens): diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index f26c578151b80..2f867d4e998b4 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -184,14 +184,14 @@ def test_model_checkpoint_score_and_ckpt_val_check_interval( limit_val_batches = 7 lr, gamma = 1e-1, 2 monitor = 'val_log' - per_epoch_val_checks = int(limit_train_batches * val_check_interval) - per_val_check_steps, leftover_steps = divmod(limit_train_batches, per_epoch_val_checks) + per_val_train_batches = int(limit_train_batches * val_check_interval) + per_epoch_val_checks, leftover_train_batches = divmod(limit_train_batches, per_val_train_batches) class CustomBoringModel(BoringModel): def __init__(self): super().__init__() - self.val_logs = torch.randn(per_val_check_steps * max_epochs, limit_val_batches) + self.val_logs = torch.randn(per_epoch_val_checks * max_epochs, limit_val_batches) self.val_loop_count = 0 def validation_step(self, batch, batch_idx): @@ -246,11 +246,11 @@ def configure_optimizers(self): additional_ckpt_path = [f for f in ckpt_files if 'v1' in f.stem][0] additional_ckpt = True - assert len(ckpt_files) == len(scores) + additional_ckpt == per_val_check_steps * max_epochs + additional_ckpt + assert len(ckpt_files) == len(scores) + additional_ckpt == per_epoch_val_checks * max_epochs + additional_ckpt assert len(lr_scheduler_debug) == max_epochs def _make_assertions(epoch, ix, version=''): - global_ix = ix + per_val_check_steps * epoch + global_ix = ix + per_epoch_val_checks * epoch duplicated = bool(version) score = scores[global_ix] @@ -261,7 +261,7 @@ def _make_assertions(epoch, ix, version=''): chk = pl_load(os.path.join(checkpoint.dirpath, expected_filename)) assert chk['epoch'] == epoch + 1 epoch_num = epoch + duplicated - expected_global_step = per_epoch_val_checks * (global_ix + 1) + (leftover_steps * epoch_num) + expected_global_step = per_val_train_batches * (global_ix + 1) + (leftover_train_batches * epoch_num) assert chk['global_step'] == expected_global_step mc_specific_data = chk['callbacks'][type(checkpoint)] @@ -278,7 +278,7 @@ def _make_assertions(epoch, ix, version=''): return score for epoch in range(max_epochs): - for i in range(per_val_check_steps): + for i in range(per_epoch_val_checks): score = _make_assertions(epoch, i) assert lr_scheduler_debug[epoch]['monitor_val'] == (score if reduce_lr_on_plateau else None) @@ -286,7 +286,7 @@ def _make_assertions(epoch, ix, version=''): # check the ckpt file saved on_train_end if additional_ckpt_path: - _make_assertions(max_epochs - 1, per_val_check_steps - 1, version='-v1') + _make_assertions(max_epochs - 1, per_epoch_val_checks - 1, version='-v1') @pytest.mark.parametrize("save_top_k", [-1, 0, 1, 2]) From efa15296bde06ba4e8b8e34f835804a42ea97006 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 13:39:37 +0200 Subject: [PATCH 18/32] Bad merge --- tests/models/test_hooks.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 4ace7b89e4251..7608d890236be 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -283,6 +283,10 @@ def validation_epoch_end(self, *args, **kwargs): self.called.append("validation_epoch_end") super().validation_epoch_end(*args, **kwargs) + def on_before_zero_grad(self, *args, **kwargs): + self.called.append("on_before_zero_grad") + super().on_before_zero_grad(*args, **kwargs) + def on_epoch_start(self): self.called.append("on_epoch_start") super().on_epoch_start() From 00ccee0349ee68aea70761bf56fa1759e6c015da Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 13:40:43 +0200 Subject: [PATCH 19/32] Bad merge --- tests/models/test_hooks.py | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 7608d890236be..dca56a39cd274 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -524,39 +524,6 @@ def test_trainer_model_hook_system_fit_no_val(tmpdir): assert model.called == expected -def test_trainer_model_hook_system_fit_no_val(tmpdir): - model = HookedModel() - train_batches = 2 - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - limit_val_batches=0, - limit_train_batches=train_batches, - progress_bar_refresh_rate=0, - weights_summary=None, - ) - assert model.called == [] - trainer.fit(model) - expected = [ - 'setup_fit', - 'on_fit_start', - 'on_pretrain_routine_start', - 'on_pretrain_routine_end', - 'on_train_start', - 'on_epoch_start', - 'on_train_epoch_start', - *(model.train_batch * train_batches), - 'training_epoch_end', - 'on_train_epoch_end', - 'on_epoch_end', - 'on_save_checkpoint', # from train epoch end - 'on_train_end', - 'on_fit_end', - 'teardown_fit', - ] - assert model.called == expected - - def test_trainer_model_hook_system_validate(tmpdir): model = HookedModel() trainer = Trainer( From deaa887f776d2af2bd2fe939e64eaeb9beed753a Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 13:41:03 +0200 Subject: [PATCH 20/32] Unrelated change --- pl_examples/bug_report_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py index 70591f3649fd9..abb65ba86fd93 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report_model.py @@ -1,5 +1,4 @@ import os -from unittest.mock import MagicMock import torch from torch.utils.data import DataLoader, Dataset From fccd7c4727bac86d3abdd60b5313b169d4deb7c8 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 13:41:32 +0200 Subject: [PATCH 21/32] Unrelated change --- tests/models/test_hooks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index dca56a39cd274..913f403a14dd3 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -502,7 +502,6 @@ def test_trainer_model_hook_system_fit_no_val(tmpdir): weights_summary=None, ) assert model.called == [] - trainer.fit(model) expected = [ 'setup_fit', From 747a39912203c6dd0865e0152e7c43acb5b25f67 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 17:20:32 +0200 Subject: [PATCH 22/32] Remove comment --- tests/trainer/loops/test_training_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/loops/test_training_loop.py b/tests/trainer/loops/test_training_loop.py index 2e17f57ec9479..da4ecbe5a9f05 100644 --- a/tests/trainer/loops/test_training_loop.py +++ b/tests/trainer/loops/test_training_loop.py @@ -141,4 +141,4 @@ def validation_step(self, *args): assert trainer.current_epoch == 0 assert trainer.global_step == 5 - assert model.validation_called_at == (0, 4) # TODO(@carmocca): should be 5 - will be fixed in next PR + assert model.validation_called_at == (0, 4) From a2943e5f60b89bd4ac9ce6f362b93219ca2fe09f Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 17:45:29 +0200 Subject: [PATCH 23/32] Add `ModelPruning(prune_on_train_epoch_end=bool)` to choose when to apply pruning --- pytorch_lightning/callbacks/pruning.py | 22 ++++++++++++++++++---- tests/callbacks/test_pruning.py | 13 +++++++++---- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/callbacks/pruning.py b/pytorch_lightning/callbacks/pruning.py index b044fc3e6e38a..e7da752d1c844 100644 --- a/pytorch_lightning/callbacks/pruning.py +++ b/pytorch_lightning/callbacks/pruning.py @@ -71,6 +71,7 @@ def __init__( pruning_dim: Optional[int] = None, pruning_norm: Optional[int] = None, verbose: int = 0, + prune_on_train_epoch_end: bool = True, ) -> None: """ Model pruning Callback, using PyTorch's prune utilities. @@ -141,6 +142,9 @@ def __init__( verbose: Verbosity level. 0 to disable, 1 to log overall sparsity, 2 to log per-layer sparsity + prune_on_train_epoch_end: whether to apply pruning at the end of the training epoch. + If this is ``False``, then the check runs at the end of the validation epoch. + Raises: MisconfigurationException: If ``parameter_names`` is neither ``"weight"`` nor ``"bias"``, @@ -155,6 +159,7 @@ def __init__( self._parameters_to_prune = parameters_to_prune self._use_lottery_ticket_hypothesis = use_lottery_ticket_hypothesis self._resample_parameters = resample_parameters + self._prune_on_train_epoch_end = prune_on_train_epoch_end self._parameter_names = parameter_names or self.PARAMETER_NAMES self._global_kwargs: Dict[str, Any] = {} self._original_layers: Optional[Dict[int, _LayerRef]] = None @@ -381,8 +386,7 @@ def on_before_accelerator_backend_setup(self, trainer: 'pl.Trainer', pl_module: self._original_layers.setdefault(id_, _LayerRef(data=deepcopy(module), names=[])) self._original_layers[id_]["names"].append((i, name)) - def on_train_epoch_end(self, trainer: 'pl.Trainer', pl_module: LightningModule) -> None: # type: ignore - current_epoch = pl_module.current_epoch + def _run_pruning(self, current_epoch: int) -> None: prune = self._apply_pruning(current_epoch) if callable(self._apply_pruning) else self._apply_pruning amount = self.amount(current_epoch) if callable(self.amount) else self.amount if not prune or not amount: @@ -395,9 +399,19 @@ def on_train_epoch_end(self, trainer: 'pl.Trainer', pl_module: LightningModule) ): self.apply_lottery_ticket_hypothesis() + def on_train_epoch_end(self, trainer: 'pl.Trainer', pl_module: LightningModule) -> None: # type: ignore + if self._prune_on_train_epoch_end: + rank_zero_debug("`ModelPruning.on_train_epoch_end`. Applying pruning") + self._run_pruning(pl_module.current_epoch) + + def on_validation_epoch_end(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule') -> None: + if not trainer.sanity_checking and not self._prune_on_train_epoch_end: + rank_zero_debug("`ModelPruning.on_validation_epoch_end`. Applying pruning") + self._run_pruning(pl_module.current_epoch) + def on_train_end(self, trainer: 'pl.Trainer', pl_module: LightningModule) -> None: if self._make_pruning_permanent: - rank_zero_debug("`ModelPruning.on_train_end`. Pruning is made permanent for this checkpoint.") + rank_zero_debug("`ModelPruning.on_train_end`. Pruning is made permanent for this checkpoint") self.make_pruning_permanent(pl_module) def on_save_checkpoint( @@ -407,7 +421,7 @@ def on_save_checkpoint( checkpoint: Dict[str, Any], ) -> Dict[str, Any]: if self._make_pruning_permanent: - rank_zero_debug("`ModelPruning.on_save_checkpoint`. Pruning is made permanent for this checkpoint.") + rank_zero_debug("`ModelPruning.on_save_checkpoint`. Pruning is made permanent for this checkpoint") prev_device = pl_module.device # prune a copy so training can continue with the same buffers copy = deepcopy(pl_module.to("cpu")) diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index 5a06a92a13fa8..5710e9d1166c0 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -270,7 +270,8 @@ def test_multiple_pruning_callbacks(tmpdir, caplog, make_pruning_permanent: bool assert not has_pruning if make_pruning_permanent else has_pruning -def test_permanent_when_model_is_saved_multiple_times(tmpdir, caplog): +@pytest.mark.parametrize("on_train_epoch_end", (False, True)) +def test_permanent_when_model_is_saved_multiple_times(tmpdir, caplog, on_train_epoch_end): """ When a model is saved multiple times and make_permanent=True, we need to make sure a copy is pruned and not the trained model if we want to continue @@ -282,15 +283,19 @@ class TestPruning(ModelPruning): def on_save_checkpoint(self, trainer, pl_module, checkpoint): super().on_save_checkpoint(trainer, pl_module, checkpoint) - assert "layer.mlp_3.weight_orig" not in checkpoint["state_dict"] - assert hasattr(pl_module.layer.mlp_3, "weight_orig") + if not on_train_epoch_end: + # these checks only work if pruning on `validation_epoch_end` + # because `on_save_checkpoint` is called before `on_train_epoch_end` + assert "layer.mlp_3.weight_orig" not in checkpoint["state_dict"] + assert hasattr(pl_module.layer.mlp_3, "weight_orig") model = TestModel() pruning_callback = TestPruning( "random_unstructured", parameters_to_prune=[(model.layer.mlp_3, "weight")], verbose=1, - make_pruning_permanent=True + make_pruning_permanent=True, + prune_on_train_epoch_end=on_train_epoch_end, ) ckpt_callback = ModelCheckpoint(monitor="test", save_top_k=2, save_last=True) trainer = Trainer(callbacks=[pruning_callback, ckpt_callback], max_epochs=3, progress_bar_refresh_rate=0) From c819df5b95dae242680a4d0199dfe0d5be0c3bb7 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 17:50:19 +0200 Subject: [PATCH 24/32] Update CHANGELOG --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14dbee08920be..ab57e7bed6c0e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added LightningCLI support for config files on object stores ([#7521](https://github.com/PyTorchLightning/pytorch-lightning/pull/7521)) +- Added `ModelPruning(prune_on_train_epoch_end)` to choose when to apply pruning ([#7704](https://github.com/PyTorchLightning/pytorch-lightning/pull/7704)) + + - Added support for checkpointing based on a provided time interval during training ([#7515](https://github.com/PyTorchLightning/pytorch-lightning/pull/7515)) From c0201bc74aeb65681c38b0b1b3e28cfbd14a0d1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 25 May 2021 18:15:15 +0200 Subject: [PATCH 25/32] Update CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ab57e7bed6c0e..ea067090ba1b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,7 +18,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added LightningCLI support for config files on object stores ([#7521](https://github.com/PyTorchLightning/pytorch-lightning/pull/7521)) -- Added `ModelPruning(prune_on_train_epoch_end)` to choose when to apply pruning ([#7704](https://github.com/PyTorchLightning/pytorch-lightning/pull/7704)) +- Added `ModelPruning(prune_on_train_epoch_end=True|False)` to choose when to apply pruning ([#7704](https://github.com/PyTorchLightning/pytorch-lightning/pull/7704)) - Added support for checkpointing based on a provided time interval during training ([#7515](https://github.com/PyTorchLightning/pytorch-lightning/pull/7515)) From 927ab83542d9db9c07ef7ccc18f933b47171eeef Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 18:37:14 +0200 Subject: [PATCH 26/32] Test with regex --- tests/callbacks/test_pruning.py | 35 ++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index 5710e9d1166c0..0500fd6b4e6fc 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import re from collections import OrderedDict from logging import INFO from typing import Union @@ -250,17 +251,20 @@ def test_multiple_pruning_callbacks(tmpdir, caplog, make_pruning_permanent: bool actual = [m.strip() for m in caplog.messages] actual = [m for m in actual if m.startswith("Applied")] - assert actual == [ - "Applied `L1Unstructured`. Pruned: 0/1122 (0.00%) -> 544/1122 (48.48%)", - "Applied `L1Unstructured` to `Linear(in_features=32, out_features=32, bias=True).weight` with amount=0.5. Pruned: 0 (0.00%) -> 503 (49.12%)", # noqa: E501 - "Applied `L1Unstructured` to `Linear(in_features=32, out_features=2, bias=True).weight` with amount=0.5. Pruned: 0 (0.00%) -> 41 (64.06%)", # noqa: E501 - "Applied `RandomUnstructured`. Pruned: 544/1122 (48.48%) -> 680/1122 (60.61%)", - "Applied `RandomUnstructured` to `Linear(in_features=32, out_features=32, bias=True).weight` with amount=0.25. Pruned: 503 (49.12%) -> 629 (61.43%)", # noqa: E501 - "Applied `RandomUnstructured` to `Linear(in_features=32, out_features=2, bias=True).weight` with amount=0.25. Pruned: 41 (64.06%) -> 51 (79.69%)", # noqa: E501 - "Applied `L1Unstructured`. Pruned: 680/1122 (60.61%) -> 884/1122 (78.79%)", - "Applied `L1Unstructured` to `Linear(in_features=32, out_features=32, bias=True).weight` with amount=0.5. Pruned: 629 (61.43%) -> 827 (80.76%)", # noqa: E501 - "Applied `L1Unstructured` to `Linear(in_features=32, out_features=2, bias=True).weight` with amount=0.5. Pruned: 51 (79.69%) -> 57 (89.06%)", # noqa: E501 + percentage = r"\d+(?:\.\d+)?%" + expected = [ + rf"Applied `L1Unstructured`. Pruned: \d+\/1122 \({percentage}\) -> \d+\/1122 \({percentage}\)", + rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.5. Pruned: 0 \(0.00%\) -> \d+ \({percentage}\)", # noqa: E501 + rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.5. Pruned: 0 \(0.00%\) -> \d+ \({percentage}\)", # noqa: E501 + rf"Applied `RandomUnstructured`. Pruned: \d+\/1122 \({percentage}\) -> \d+\/1122 \({percentage}\)", + rf"Applied `RandomUnstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.25. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)", # noqa: E501 + rf"Applied `RandomUnstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.25. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)", # noqa: E501 + rf"Applied `L1Unstructured`. Pruned: \d+\/1122 \({percentage}\) -> \d+\/1122 \({percentage}\)", + rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.5. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)", # noqa: E501 + rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.5. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)", # noqa: E501 ] + expected = [re.compile(s) for s in expected] + assert all(regex.match(s) for s, regex in zip(actual, expected)) filepath = str(tmpdir / "foo.ckpt") trainer.save_checkpoint(filepath) @@ -304,11 +308,14 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint): actual = [m.strip() for m in caplog.messages] actual = [m for m in actual if m.startswith("Applied")] - assert actual == [ - "Applied `RandomUnstructured`. Pruned: 0/66 (0.00%) -> 32/66 (48.48%)", - "Applied `RandomUnstructured`. Pruned: 32/66 (48.48%) -> 48/66 (72.73%)", - "Applied `RandomUnstructured`. Pruned: 48/66 (72.73%) -> 56/66 (84.85%)", + percentage = r"\d+(?:\.\d+)?%" + expected = [ + rf"Applied `RandomUnstructured`. Pruned: \d+\/66 \({percentage}\) -> \d+\/66 \({percentage}\)", + rf"Applied `RandomUnstructured`. Pruned: \d+\/66 \({percentage}\) -> \d+\/66 \({percentage}\)", + rf"Applied `RandomUnstructured`. Pruned: \d+\/66 \({percentage}\) -> \d+\/66 \({percentage}\)", ] + expected = [re.compile(s) for s in expected] + assert all(regex.match(s) for s, regex in zip(actual, expected)) # removed on_train_end assert not hasattr(model.layer.mlp_3, "weight_orig") From 08a5c4bfd912d83a47cc5e802ae0d91ddbd506f9 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 18:38:59 +0200 Subject: [PATCH 27/32] Minor change --- tests/callbacks/test_pruning.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index 0500fd6b4e6fc..65ec440dcee75 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -251,17 +251,17 @@ def test_multiple_pruning_callbacks(tmpdir, caplog, make_pruning_permanent: bool actual = [m.strip() for m in caplog.messages] actual = [m for m in actual if m.startswith("Applied")] - percentage = r"\d+(?:\.\d+)?%" + percentage = r"\(\d+(?:\.\d+)?%\)" expected = [ - rf"Applied `L1Unstructured`. Pruned: \d+\/1122 \({percentage}\) -> \d+\/1122 \({percentage}\)", - rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.5. Pruned: 0 \(0.00%\) -> \d+ \({percentage}\)", # noqa: E501 - rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.5. Pruned: 0 \(0.00%\) -> \d+ \({percentage}\)", # noqa: E501 - rf"Applied `RandomUnstructured`. Pruned: \d+\/1122 \({percentage}\) -> \d+\/1122 \({percentage}\)", - rf"Applied `RandomUnstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.25. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)", # noqa: E501 - rf"Applied `RandomUnstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.25. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)", # noqa: E501 - rf"Applied `L1Unstructured`. Pruned: \d+\/1122 \({percentage}\) -> \d+\/1122 \({percentage}\)", - rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.5. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)", # noqa: E501 - rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.5. Pruned: \d+ \({percentage}\) -> \d+ \({percentage}\)", # noqa: E501 + rf"Applied `L1Unstructured`. Pruned: \d+\/1122 {percentage} -> \d+\/1122 {percentage}", + rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.5. Pruned: 0 \(0.00%\) -> \d+ {percentage}", # noqa: E501 + rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.5. Pruned: 0 \(0.00%\) -> \d+ {percentage}", # noqa: E501 + rf"Applied `RandomUnstructured`. Pruned: \d+\/1122 {percentage} -> \d+\/1122 {percentage}", + rf"Applied `RandomUnstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.25. Pruned: \d+ {percentage} -> \d+ {percentage}", # noqa: E501 + rf"Applied `RandomUnstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.25. Pruned: \d+ {percentage} -> \d+ {percentage}", # noqa: E501 + rf"Applied `L1Unstructured`. Pruned: \d+\/1122 {percentage} -> \d+\/1122 {percentage}", + rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=32, bias=True\).weight` with amount=0.5. Pruned: \d+ {percentage} -> \d+ {percentage}", # noqa: E501 + rf"Applied `L1Unstructured` to `Linear\(in_features=32, out_features=2, bias=True\).weight` with amount=0.5. Pruned: \d+ {percentage} -> \d+ {percentage}", # noqa: E501 ] expected = [re.compile(s) for s in expected] assert all(regex.match(s) for s, regex in zip(actual, expected)) @@ -308,11 +308,11 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint): actual = [m.strip() for m in caplog.messages] actual = [m for m in actual if m.startswith("Applied")] - percentage = r"\d+(?:\.\d+)?%" + percentage = r"\(\d+(?:\.\d+)?%\)" expected = [ - rf"Applied `RandomUnstructured`. Pruned: \d+\/66 \({percentage}\) -> \d+\/66 \({percentage}\)", - rf"Applied `RandomUnstructured`. Pruned: \d+\/66 \({percentage}\) -> \d+\/66 \({percentage}\)", - rf"Applied `RandomUnstructured`. Pruned: \d+\/66 \({percentage}\) -> \d+\/66 \({percentage}\)", + rf"Applied `RandomUnstructured`. Pruned: \d+\/66 {percentage} -> \d+\/66 {percentage}", + rf"Applied `RandomUnstructured`. Pruned: \d+\/66 {percentage} -> \d+\/66 {percentage}", + rf"Applied `RandomUnstructured`. Pruned: \d+\/66 {percentage} -> \d+\/66 {percentage}", ] expected = [re.compile(s) for s in expected] assert all(regex.match(s) for s, regex in zip(actual, expected)) From b0c630f943466fe4b95940ed1964da5e42f015a7 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 25 May 2021 18:40:07 +0200 Subject: [PATCH 28/32] No need to seed anymore --- tests/callbacks/test_pruning.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index 65ec440dcee75..d4957905454d8 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -22,7 +22,7 @@ from torch import nn from torch.nn import Sequential -from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint, ModelPruning from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel @@ -225,7 +225,6 @@ def apply_lottery_ticket_hypothesis(self): @pytest.mark.parametrize("make_pruning_permanent", (False, True)) def test_multiple_pruning_callbacks(tmpdir, caplog, make_pruning_permanent: bool): - seed_everything(0) model = TestModel() pruning_kwargs = { 'parameters_to_prune': [(model.layer.mlp_1, "weight"), (model.layer.mlp_3, "weight")], @@ -281,7 +280,6 @@ def test_permanent_when_model_is_saved_multiple_times(tmpdir, caplog, on_train_e make sure a copy is pruned and not the trained model if we want to continue with the same pruning buffers. """ - seed_everything(0) class TestPruning(ModelPruning): From 4284a63ec6cc5dd11ae7659c4a117eb21639a0b3 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 26 May 2021 01:42:05 +0200 Subject: [PATCH 29/32] Update CHANGELOG --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c07b13ac93612..9de22de41da69 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -67,7 +67,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). * Refactored result handling in training loop ([#7506](https://github.com/PyTorchLightning/pytorch-lightning/pull/7506)) * Moved attributes `hiddens` and `split_idx` to TrainLoop ([#7507](https://github.com/PyTorchLightning/pytorch-lightning/pull/7507)) * Refactored the logic around manual and automatic optimization inside the optimizer loop ([#7526](https://github.com/PyTorchLightning/pytorch-lightning/pull/7526)) - + * Simplified "should run validation" logic ([#7682](https://github.com/PyTorchLightning/pytorch-lightning/pull/7682)) + * Refactored "should run validation" logic when the trainer is signaled to stop ([#7701](https://github.com/PyTorchLightning/pytorch-lightning/pull/7701)) + * Always run validation inside the training loop epoch ([#7357](https://github.com/PyTorchLightning/pytorch-lightning/pull/7357)) - Moved `ignore_scalar_return_in_dp` warning suppression to the DataParallelPlugin class ([#7421](https://github.com/PyTorchLightning/pytorch-lightning/pull/7421/)) From 09f5d39df1fbef6b5a28717cc9d54f41f39924cd Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 26 May 2021 02:54:06 +0200 Subject: [PATCH 30/32] Update CHANGELOG --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9de22de41da69..a065b846b4bb1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,6 +62,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Changed `clip_grad_norm` to use `torch.nn.utils.clip_grad_norm_` ([#7025](https://github.com/PyTorchLightning/pytorch-lightning/pull/7025)) + +- Validation is now always run inside the training epoch scope ([#7357](https://github.com/PyTorchLightning/pytorch-lightning/pull/7357)) + + - Refactored Loops * Moved attributes `global_step`, `current_epoch`, `max/min_steps`, `max/min_epochs`, `batch_idx`, and `total_batch_idx` to TrainLoop ([#7437](https://github.com/PyTorchLightning/pytorch-lightning/pull/7025)) * Refactored result handling in training loop ([#7506](https://github.com/PyTorchLightning/pytorch-lightning/pull/7506)) @@ -69,7 +73,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). * Refactored the logic around manual and automatic optimization inside the optimizer loop ([#7526](https://github.com/PyTorchLightning/pytorch-lightning/pull/7526)) * Simplified "should run validation" logic ([#7682](https://github.com/PyTorchLightning/pytorch-lightning/pull/7682)) * Refactored "should run validation" logic when the trainer is signaled to stop ([#7701](https://github.com/PyTorchLightning/pytorch-lightning/pull/7701)) - * Always run validation inside the training loop epoch ([#7357](https://github.com/PyTorchLightning/pytorch-lightning/pull/7357)) - Moved `ignore_scalar_return_in_dp` warning suppression to the DataParallelPlugin class ([#7421](https://github.com/PyTorchLightning/pytorch-lightning/pull/7421/)) From 95eeda127f71ba883bf764ca2c2de2c793c48961 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 26 May 2021 02:54:32 +0200 Subject: [PATCH 31/32] Update CHANGELOG --- CHANGELOG.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a065b846b4bb1..8dfcdbb08d346 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,8 +71,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). * Refactored result handling in training loop ([#7506](https://github.com/PyTorchLightning/pytorch-lightning/pull/7506)) * Moved attributes `hiddens` and `split_idx` to TrainLoop ([#7507](https://github.com/PyTorchLightning/pytorch-lightning/pull/7507)) * Refactored the logic around manual and automatic optimization inside the optimizer loop ([#7526](https://github.com/PyTorchLightning/pytorch-lightning/pull/7526)) - * Simplified "should run validation" logic ([#7682](https://github.com/PyTorchLightning/pytorch-lightning/pull/7682)) - * Refactored "should run validation" logic when the trainer is signaled to stop ([#7701](https://github.com/PyTorchLightning/pytorch-lightning/pull/7701)) - Moved `ignore_scalar_return_in_dp` warning suppression to the DataParallelPlugin class ([#7421](https://github.com/PyTorchLightning/pytorch-lightning/pull/7421/)) From 1b18342ba47491df84053b0e9f5334630da50869 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 26 May 2021 02:55:44 +0200 Subject: [PATCH 32/32] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8dfcdbb08d346..199aa70329e24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -72,6 +72,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). * Moved attributes `hiddens` and `split_idx` to TrainLoop ([#7507](https://github.com/PyTorchLightning/pytorch-lightning/pull/7507)) * Refactored the logic around manual and automatic optimization inside the optimizer loop ([#7526](https://github.com/PyTorchLightning/pytorch-lightning/pull/7526)) + - Moved `ignore_scalar_return_in_dp` warning suppression to the DataParallelPlugin class ([#7421](https://github.com/PyTorchLightning/pytorch-lightning/pull/7421/))