resurface lost ddp info message (#8111)
awaelchli authored and lexierule committed Jul 1, 2021
1 parent f56df26 commit 26c80c5
Showing 2 changed files with 18 additions and 16 deletions.
pytorch_lightning/plugins/training_type/ddp.py (9 additions, 8 deletions)
@@ -35,7 +35,7 @@
     _TORCH_GREATER_EQUAL_1_8,
     rank_zero_warn,
 )
-from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, sync_ddp_if_available
+from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only, ReduceOp, sync_ddp_if_available
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.seed import reset_seed

@@ -197,13 +197,6 @@ def setup_distributed(self):
         # where to store ip_table
         self.init_ddp_connection()
 
-        # on world_size=0 let everyone know training is starting
-        if self.is_global_zero and not torch.distributed.is_initialized():
-            log.info("-" * 100)
-            log.info(f"distributed_backend={self.distributed_backend}")
-            log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
-            log.info("-" * 100)
-
         # set the ranks and devices
         self.dist.rank = self.global_rank
         self.dist.device = self.root_device
@@ -271,6 +264,14 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt
             log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}")
             torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
 
+        # on rank=0 let everyone know training is starting
+        rank_zero_info(
+            f"{'-' * 100}\n"
+            f"distributed_backend={self.torch_distributed_backend}\n"
+            f"All DDP processes registered. Starting ddp with {self.world_size} processes\n"
+            f"{'-' * 100}\n"
+        )
+
     def pre_dispatch(self):
         # move the model to the correct device
         self.model_to_device()
pytorch_lightning/plugins/training_type/ddp_spawn.py (9 additions, 8 deletions)
@@ -31,7 +31,7 @@
 from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7, _TORCH_GREATER_EQUAL_1_8, rank_zero_warn
 from pytorch_lightning.utilities.cloud_io import atomic_save
 from pytorch_lightning.utilities.cloud_io import load as pl_load
-from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, sync_ddp_if_available
+from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only, ReduceOp, sync_ddp_if_available
 from pytorch_lightning.utilities.seed import reset_seed
 
 if _TORCH_GREATER_EQUAL_1_8:
@@ -148,13 +148,6 @@ def new_process(self, process_idx, trainer, mp_queue):
         # ... need to double check that it is the correct place
         # self.trainer.call_setup_hook(self.model)
 
-        # on world_size=0 let everyone know training is starting
-        if self.is_global_zero and not torch.distributed.is_initialized():
-            log.info("-" * 100)
-            log.info(f"distributed_backend={self.distributed_backend}")
-            log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
-            log.info("-" * 100)
-
         # set the ranks and devices
         self.dist.rank = self.global_rank
         self.dist.device = self.root_device
@@ -230,6 +223,14 @@ def init_ddp_connection(self, global_rank: Optional[int], world_size: Optional[i
             log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}")
             torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
 
+        # on rank=0 let everyone know training is starting
+        rank_zero_info(
+            f"{'-' * 100}\n"
+            f"distributed_backend={self.torch_distributed_backend}\n"
+            f"All DDP processes registered. Starting ddp with {self.world_size} processes\n"
+            f"{'-' * 100}\n"
+        )
+
     def determine_ddp_device_ids(self):
         if self.root_device.type == "cpu":
             return None
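Why the message was lost, and why the new placement resurfaces it: in both plugins the old banner ran right after self.init_ddp_connection() and was guarded by `if self.is_global_zero and not torch.distributed.is_initialized()`. By that point the process group has already been initialized, so the guard could never be true and the banner was silently skipped. The commit instead emits the banner with rank_zero_info directly after torch_distrib.init_process_group() inside init_ddp_connection(), so it is printed exactly once, by global rank 0. Below is a minimal sketch, not taken from the commit, of how rank_zero_info behaves; it assumes the 1.3-era import path shown in the diff, and the message text and explicit rank override are purely illustrative.

# Minimal sketch (not from the commit): rank_zero_info wraps a logging call so
# that only the process whose global rank is 0 emits it.
from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only

# rank_zero_only.rank is picked up from the usual rank environment variables
# (e.g. RANK / LOCAL_RANK) and defaults to 0 in a single-process run.
rank_zero_info("All DDP processes registered. Starting ddp with 4 processes")  # logged

rank_zero_only.rank = 1  # pretend we are a non-zero rank (illustration only)
rank_zero_info("this message is dropped on non-zero ranks")  # no-op

Because the helper is rank-aware on its own, the relocated banner no longer needs the is_global_zero / is_initialized guard that suppressed the original message.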
