Skip to content

Commit

Permalink
TPU Spawn Rank & root device Error (#7074)
Browse files Browse the repository at this point in the history
* TPU Spawn Rank Error

* Update tpu spawn

* Fix root device property for tpu spawn

* Update changelog
  • Loading branch information
kaushikb11 authored Apr 18, 2021
1 parent 71b4611 commit 30b7440
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 7 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed process rank not being available right away after `Trainer` instantiation ([#6941](https://github.com/PyTorchLightning/pytorch-lightning/pull/6941))


- Fixed the order in which the world ranks are set, and fixed the `root_device` property, in `TPUSpawnPlugin` ([#7074](https://github.com/PyTorchLightning/pytorch-lightning/pull/7074))


## [1.2.7] - 2021-04-06

### Fixed
Expand Down
17 changes: 10 additions & 7 deletions pytorch_lightning/plugins/training_type/tpu_spawn.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ def local_rank(self) -> int:
# World size as seen by this plugin: simply the value of `self.num_processes`.
def world_size(self) -> int:
return self.num_processes

# Root device of this plugin: returns whatever is currently stored in `self.device`.
@property
def root_device(self) -> torch.device:
return self.device

@staticmethod
def _validate_dataloader(dataloaders: Union[List['DataLoader'], 'DataLoader']):
if not isinstance(dataloaders, list):
Expand Down Expand Up @@ -116,9 +120,7 @@ def is_distributed(self):

# Validate `dataloader` with `TPUSpawnPlugin._validate_dataloader`, then wrap it in an `MpDeviceLoader`.
# NOTE(review): the hunk header for this span reports 9 old lines vs. 7 new lines, so this listing
# presumably interleaves removed and added lines with the +/- markers lost -- TODO confirm against the repository.
def process_dataloader(self, dataloader: 'DataLoader') -> MpDeviceLoader:
TPUSpawnPlugin._validate_dataloader(dataloader)
# Variant that looks the device up through `xm.xla_device()` and returns the rebound `dataloader`.
device = xm.xla_device()
dataloader = MpDeviceLoader(dataloader, device)
return dataloader
# Variant that wraps with `self.device` instead; read literally it follows a `return` and is unreachable.
return MpDeviceLoader(dataloader, self.device)

# No-op in this plugin: the body is a bare `pass` and nothing is returned.
def configure_ddp(self) -> None:
pass
Expand All @@ -127,8 +129,7 @@ def init_ddp_connection(self, global_rank: int, world_size: int) -> None:
pass

# Accepts an optional `process_idx` (default 0); nothing in the body reads it.
# NOTE(review): the hunk header for this span reports 8 old lines vs. 7 new lines, so the two `xm`
# assignments are presumably the removed side and `pass` the added side (the same two assignments
# appear inside `new_process` further down) -- TODO confirm against the repository.
def set_world_ranks(self, process_idx: int = 0) -> None:
# Store the local and global ordinals returned by `xm` on the plugin.
self.tpu_local_core_rank = xm.get_local_ordinal()
self.tpu_global_core_rank = xm.get_ordinal()
pass

def new_process(self, process_idx: int, trainer, mp_queue) -> None:
self.mp_queue = mp_queue
Expand All @@ -137,7 +138,8 @@ def new_process(self, process_idx: int, trainer, mp_queue) -> None:
if seed is not None:
seed_everything(int(seed))

self.set_world_ranks()
self.tpu_local_core_rank = xm.get_local_ordinal()
self.tpu_global_core_rank = xm.get_ordinal()

# set warning rank
rank_zero_only.rank = self.global_rank
Expand All @@ -163,7 +165,8 @@ def new_process(self, process_idx: int, trainer, mp_queue) -> None:
time.sleep(2)

# Move the model onto the device returned by `xm.xla_device()`.
# NOTE(review): the hunk header for this span reports 7 old lines vs. 8 new lines, so the `_model`
# line is presumably the removed side and the two lines after it the added side -- TODO confirm
# against the repository.
def model_to_device(self) -> None:
# Variant that moves `self._model` directly and does not keep a reference to the device.
self._model.to(xm.xla_device())
# Variant that first records the device on `self.device`, then moves `self.model` to it.
self.device = xm.xla_device()
self.model.to(self.device)

# Delegates to `rendezvous`, passing `name` through unchanged (it may be None); returns nothing.
def barrier(self, name: Optional[str] = None) -> None:
rendezvous(name)
Expand Down

0 comments on commit 30b7440

Please sign in to comment.