[bugfix] Accumulated_gradient and TensorBoard #4738

Merged (29 commits, Nov 25, 2020)
Commits
47efe42  resolve bug (tchaton, Nov 18, 2020)
e961b44  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 18, 2020)
dbd740d  update (tchaton, Nov 18, 2020)
f5ef9f5  Merge branch 'bugfix/4304_tensorboard_accumulated_grad' of https://gi… (tchaton, Nov 18, 2020)
891d0ed  update (tchaton, Nov 18, 2020)
07816d8  modify one test (tchaton, Nov 18, 2020)
5030043  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 18, 2020)
dfab61a  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 18, 2020)
f571f4d  remove paramters (tchaton, Nov 18, 2020)
7d2ea62  Merge branch 'bugfix/4304_tensorboard_accumulated_grad' of https://gi… (tchaton, Nov 18, 2020)
24cee2c  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 18, 2020)
de12d31  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 18, 2020)
011b65f  update on comments (tchaton, Nov 18, 2020)
fdce1a9  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 18, 2020)
c18a070  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 19, 2020)
88e1afa  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 19, 2020)
7de5c99  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 19, 2020)
df6571b  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 20, 2020)
f6d0f0a  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 20, 2020)
ef357ca  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 20, 2020)
b86fa58  update changelog (tchaton, Nov 23, 2020)
5898183  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 23, 2020)
0dd02d0  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 23, 2020)
a28bbec  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 23, 2020)
fa3a57b  update docstring (tchaton, Nov 24, 2020)
3f7bb6a  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 24, 2020)
21fe93a  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (SkafteNicki, Nov 24, 2020)
d8ac3c5  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 25, 2020)
f5cb188  Merge branch 'master' into bugfix/4304_tensorboard_accumulated_grad (tchaton, Nov 25, 2020)
CHANGELOG.md (2 changes: 2 additions & 0 deletions)
@@ -112,6 +112,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Change Metrics `persistent` default mode to `False` ([#4685](https://github.com/PyTorchLightning/pytorch-lightning/pull/4685))

- Changed `LoggerConnector.log_metrics` to use `total_batch_idx` instead of `global_step` when logging on `training_step` ([#4738](https://github.com/PyTorchLightning/pytorch-lightning/pull/4738))


### Fixed

pytorch_lightning/trainer/connectors/logger_connector.py
@@ -95,14 +95,14 @@ def cache_logged_metrics(self) -> Union[EpochResultStore, None]:
if self._current_stage is not None:
self._cached_results[self._current_stage].cache_result()

def on_trainer_init(self, logger, flush_logs_every_n_steps: int, log_every_n_steps: int, move_metrics_to_cpu: bool):
def on_trainer_init(self, logger, flush_logs_every_n_steps: int,
log_every_n_steps: int, move_metrics_to_cpu: bool):
# logging
self.configure_logger(logger)
# todo: IDE is complaining, these shall be initialized in the Trainer init at least as placeholders
# and assign here the desired value
self.trainer.flush_logs_every_n_steps = flush_logs_every_n_steps
self.trainer.log_every_n_steps = log_every_n_steps

self.trainer.move_metrics_to_cpu = move_metrics_to_cpu
self.trainer.split_idx = None

@@ -181,7 +181,7 @@ def cache_training_step_metrics(self, opt_closure_result):
self.logged_metrics.update(logged_metrics_tmp)
self.cached_results.legacy_batch_log_metrics.update(logged_metrics_tmp)

def log_metrics(self, metrics, grad_norm_dic, step=None):
def log_metrics(self, metrics, grad_norm_dic, step=None, log_train_step_metrics=False):
"""Logs the metric dict passed in.
If `step` parameter is None and `step` key is presented is metrics,
uses metrics["step"] as a step
@@ -190,6 +190,8 @@ def log_metrics(self, metrics, grad_norm_dic, step=None):
metrics (dict): Metric values
grad_norm_dic (dict): Gradient norms
step (int): Step for which metrics should be logged. Default value corresponds to `self.global_step`
log_train_step_metrics (bool): Used to track whether `log_metrics` is being called during a training step.
In training steps, we will log metrics at `total_batch_idx` (to support accumulated gradients) and at `global_step` otherwise.
"""
# add gpu memory
if self.trainer.on_gpu and self.trainer.log_gpu_memory:
@@ -207,8 +209,11 @@

elif step is None:
# added metrics by Lightning for convenience
scalar_metrics['epoch'] = self.trainer.current_epoch
step = self.trainer.global_step
if log_train_step_metrics:
step = self.trainer.total_batch_idx
else:
scalar_metrics['epoch'] = self.trainer.current_epoch
step = self.trainer.global_step

# log actual metrics
if self.trainer.logger is not None:
@@ -619,5 +624,5 @@ def log_train_step_metrics(self, batch_output):
metrics = self.cached_results.get_latest_batch_log_metrics()
grad_norm_dic = batch_output.grad_norm_dic
if len(metrics) > 0 or len(grad_norm_dic) > 0:
self.log_metrics(metrics, grad_norm_dic)
self.log_metrics(metrics, grad_norm_dic, log_train_step_metrics=True)
self.callback_metrics.update(metrics)
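
Aside: the step selection that the hunks above introduce can be summarized with a minimal sketch (illustration only; the helper name `choose_logging_step` is hypothetical, while the `trainer` attributes and the `log_train_step_metrics` flag follow the diff):

    # Minimal sketch of the new step selection (not the actual log_metrics body).
    def choose_logging_step(trainer, step=None, log_train_step_metrics=False):
        if step is not None:
            return step
        if log_train_step_metrics:
            # While gradients are accumulated, global_step only advances every
            # accumulate_grad_batches batches, so consecutive training-step logs
            # would collide on the same step; the raw batch counter keeps them distinct.
            return trainer.total_batch_idx
        # Validation/test and epoch-level logging keep the previous behaviour,
        # including the convenience 'epoch' metric added alongside global_step.
        return trainer.global_step
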
pytorch_lightning/trainer/trainer.py (48 changes: 24 additions & 24 deletions)
@@ -21,48 +21,48 @@
import torch
from torch.utils.data import DataLoader

from pytorch_lightning import _logger as log
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector
from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator
from pytorch_lightning.callbacks import Callback, ModelCheckpoint
from pytorch_lightning.core.datamodule import LightningDataModule
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.core.memory import ModelSummary
from pytorch_lightning.core.step_result import Result, EvalResult
from pytorch_lightning.core.step_result import EvalResult, Result
from pytorch_lightning.loggers import LightningLoggerBase
from pytorch_lightning.plugins.plugin_connector import PluginConnector
from pytorch_lightning.profiler import BaseProfiler
from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin
from pytorch_lightning.trainer.configuration_validator import ConfigValidator
from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector
from pytorch_lightning.trainer.connectors.data_connector import DataConnector
from pytorch_lightning.trainer.connectors.debugging_connector import DebuggingConnector
from pytorch_lightning.trainer.connectors.env_vars_connector import overwrite_by_env_vars
from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector
from pytorch_lightning.trainer.connectors.model_connector import ModelConnector
from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector
from pytorch_lightning.trainer.connectors.precision_connector import PrecisionConnector
from pytorch_lightning.trainer.connectors.profiler_connector import ProfilerConnector
from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector
from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector
from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin
from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop
from pytorch_lightning.trainer.logging import TrainerLoggingMixin
from pytorch_lightning.trainer.model_hooks import TrainerModelHooksMixin
from pytorch_lightning.trainer.optimizers import TrainerOptimizersMixin
from pytorch_lightning.trainer.properties import TrainerProperties
from pytorch_lightning.trainer.states import TrainerState, trainer_state
from pytorch_lightning.trainer.training_loop import TrainLoop
from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin
from pytorch_lightning.tuner.tuning import Tuner
from pytorch_lightning.utilities import rank_zero_warn
from pytorch_lightning.utilities.cloud_io import load as pl_load
from pytorch_lightning.utilities.debugging import InternalDebugger
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop
from pytorch_lightning.trainer.training_loop import TrainLoop
from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector
from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector
from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector
from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector
from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector
from pytorch_lightning.trainer.connectors.model_connector import ModelConnector
from pytorch_lightning.trainer.connectors.debugging_connector import DebuggingConnector
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector
from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector
from pytorch_lightning import _logger as log
from pytorch_lightning.tuner.tuning import Tuner
from pytorch_lightning.trainer.connectors.precision_connector import PrecisionConnector
from pytorch_lightning.trainer.connectors.profiler_connector import ProfilerConnector
from pytorch_lightning.trainer.connectors.data_connector import DataConnector
from pytorch_lightning.utilities.cloud_io import load as pl_load
from pytorch_lightning.utilities.model_utils import is_overridden
from pytorch_lightning.trainer.properties import TrainerProperties
from pytorch_lightning.plugins.plugin_connector import PluginConnector
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator
from pytorch_lightning.utilities.memory import recursive_detach
from pytorch_lightning.utilities.model_utils import is_overridden

# warnings to ignore in trainer
warnings.filterwarnings(
@@ -385,7 +385,7 @@ def __init__(
logger,
flush_logs_every_n_steps,
log_every_n_steps,
move_metrics_to_cpu
move_metrics_to_cpu,
)

# init debugging flags
tests/loggers/test_all.py (10 changes: 5 additions & 5 deletions)
@@ -21,13 +21,13 @@
import pytest

import tests.base.develop_utils as tutils
from pytorch_lightning import Trainer, Callback
from pytorch_lightning import Callback, Trainer
from pytorch_lightning.loggers import (
TensorBoardLogger,
CometLogger,
MLFlowLogger,
NeptuneLogger,
TensorBoardLogger,
TestTubeLogger,
CometLogger,
WandbLogger,
)
from pytorch_lightning.loggers.base import DummyExperiment
@@ -124,15 +124,15 @@ def log_metrics(self, metrics, step):
if logger_class == TensorBoardLogger:
expected = [
(0, ['hp_metric']),
(0, ['epoch', 'train_some_val']),
(0, ['train_some_val']),
(0, ['early_stop_on', 'epoch', 'val_acc']),
(0, ['hp_metric']),
(1, ['epoch', 'test_acc', 'test_loss'])
]
assert log_metric_names == expected
else:
expected = [
(0, ['epoch', 'train_some_val']),
(0, ['train_some_val']),
(0, ['early_stop_on', 'epoch', 'val_acc']),
(1, ['epoch', 'test_acc', 'test_loss'])
]
tests/loggers/test_tensorboard.py (63 changes: 62 additions & 1 deletion)
@@ -14,6 +14,7 @@
import os
from argparse import Namespace
from distutils.version import LooseVersion
from unittest import mock

import pytest
import torch
@@ -23,7 +24,7 @@

from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers import TensorBoardLogger
from tests.base import EvalModelTemplate, BoringModel
from tests.base import BoringModel, EvalModelTemplate


@pytest.mark.skipif(
@@ -201,3 +202,63 @@ def test_tensorboard_log_graph_warning_no_example_input_array(tmpdir):
' attribute is not set or `input_array` was not given'
):
logger.log_graph(model)


@mock.patch('pytorch_lightning.loggers.TensorBoardLogger.log_metrics')
@pytest.mark.parametrize('expected', [
    ([5, 11, 17]),
])
def test_tensorboard_with_accummulated_gradients(mock_log_metrics, expected, tmpdir):
    """
    Tests to ensure that TensorBoard logs properly when accumulate_grad_batches > 1
    """
    class TestModel(BoringModel):
        _count = 0
        _indexes = []

        def training_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.log('count', self._count, on_step=True, on_epoch=True)
            self.log('loss', loss, on_step=True, on_epoch=True)

            if self.trainer.logger_connector.should_update_logs:
                self._indexes.append(self._count)

            self._count += 1
            return loss

        def validation_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.log('val_loss', loss, on_step=True, on_epoch=True)
            return loss

        def configure_optimizers(self):
            optimizer = torch.optim.SGD(self.layer.parameters(), lr=.001)
            lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
            return [optimizer], [lr_scheduler]

    model = TestModel()
    model.training_epoch_end = None
    model.validation_epoch_end = None

    logger_0 = TensorBoardLogger(tmpdir, default_hp_metric=False)

    accumulate_grad_batches = 2
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=12,
        limit_val_batches=12,
        max_epochs=3,
        gpus=0,
        accumulate_grad_batches=accumulate_grad_batches,
        logger=[logger_0],
        log_every_n_steps=3,
    )
    trainer.fit(model)

    mock_count_epochs = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_epoch" in m[2]["metrics"]]
    assert mock_count_epochs == expected
    mock_count_steps = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_step" in m[2]["metrics"]]
    assert model._indexes == mock_count_steps
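
One way to read the parametrized expectation above (our arithmetic, based on the Trainer settings in the test): with `limit_train_batches=12` and `accumulate_grad_batches=2` there are 6 optimizer steps per epoch, so the epoch-level `count_epoch` point of each of the 3 epochs lands on the last (0-indexed) global step of that epoch:

    # Our reading of why expected == [5, 11, 17] for this configuration.
    steps_per_epoch = 12 // 2  # limit_train_batches // accumulate_grad_batches
    expected = [(epoch + 1) * steps_per_epoch - 1 for epoch in range(3)]
    assert expected == [5, 11, 17]

The per-step assertion, in contrast, checks that every `count_step` point is logged at the batch index recorded in `model._indexes`, i.e. at `total_batch_idx` rather than `global_step`.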