From eafec7d425bf691ab5bfaf2794c3e581487ecfa8 Mon Sep 17 00:00:00 2001 From: Eugene Khvedchenya Date: Tue, 6 Apr 2021 10:40:29 +0300 Subject: [PATCH] Fix DDP + SyncBN (#6838) * Fix DDP + SyncBN Ensure that model is already on correct GPU before applying SyncBN conversion * Fix order of SyncBN for ddp_spawn --- pytorch_lightning/plugins/training_type/ddp.py | 6 +++--- pytorch_lightning/plugins/training_type/ddp_spawn.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 58e26e7db32d8..b8437b0d418e4 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -241,12 +241,12 @@ def init_ddp_connection(self, global_rank: int, world_size: int) -> None: torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size) def pre_dispatch(self): - if self.sync_batchnorm: - self.model = self.configure_sync_batchnorm(self.model) - # move the model to the correct device self.model_to_device() + if self.sync_batchnorm: + self.model = self.configure_sync_batchnorm(self.model) + self.configure_ddp() self.barrier() diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 87d7fa5faecac..126afc9be6040 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -141,12 +141,12 @@ def new_process(self, process_idx, trainer, mp_queue): self.dist.rank = self.global_rank self.dist.device = self.root_device - if self.sync_batchnorm: - self.model = self.configure_sync_batchnorm(self.model) - # move the model to the correct device self.model_to_device() + if self.sync_batchnorm: + self.model = self.configure_sync_batchnorm(self.model) + self.configure_ddp() self.barrier()