From 61fc9512cd91969331fd4a7b17431e6690407292 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 16 Jul 2021 11:11:17 +0530 Subject: [PATCH 01/19] Support devices flag to Trainer --- .../connectors/accelerator_connector.py | 33 +++++++++++++++++-- pytorch_lightning/trainer/trainer.py | 8 +++-- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index aea853720cd8c..132ba13873f8b 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -85,6 +85,7 @@ class AcceleratorConnector(object): def __init__( self, num_processes, + devices, tpu_cores, ipus, distributed_backend, @@ -106,6 +107,7 @@ def __init__( self._accelerator_type = None self.num_processes = num_processes + self.devices = devices # `gpus` is the input passed to the Trainer, whereas `gpu_ids` is a list of parsed gpu ids. self.gpus = gpus self.parallel_device_ids = gpu_ids @@ -291,6 +293,8 @@ def cluster_environment(self) -> ClusterEnvironment: @property def has_cpu(self) -> bool: + if self.num_processes is not None: + self._map_devices_to_accelerator(DeviceType.CPU) return True @property @@ -302,7 +306,9 @@ def has_gpu(self) -> bool: # Here, we are not checking for GPU availability, but instead if User has passed # `gpus` to Trainer for training. gpus = self.parallel_device_ids - return gpus is not None and len(gpus) > 0 + if gpus is not None and len(gpus) > 0: + return True + return self._map_devices_to_accelerator(DeviceType.GPU) @property def use_gpu(self) -> bool: @@ -312,7 +318,9 @@ def use_gpu(self) -> bool: def has_tpu(self) -> bool: # Here, we are not checking for TPU availability, but instead if User has passed # `tpu_cores` to Trainer for training. - return self.tpu_cores is not None + if self.tpu_cores is not None: + return True + return self._map_devices_to_accelerator(DeviceType.TPU) @property def use_tpu(self) -> bool: @@ -328,12 +336,31 @@ def tpu_id(self) -> Optional[int]: def has_ipu(self) -> bool: # Here, we are not checking for IPU availability, but instead if User has passed # `ipus` to Trainer for training. - return self.ipus is not None or isinstance(self._training_type_plugin, IPUPlugin) + if self.ipus is not None or isinstance(self._training_type_plugin, IPUPlugin): + return True + return self._map_devices_to_accelerator(DeviceType.IPU) @property def use_ipu(self) -> bool: return self._accelerator_type == DeviceType.IPU and self.has_ipu + def _map_devices_to_accelerator(self, accelerator: str) -> bool: + if self.devices is not None: + return False + if accelerator == DeviceType.TPU and _TPU_AVAILABLE: + self.tpu_cores = device_parser.parse_tpu_cores(self.devices) + return True + elif accelerator == DeviceType.IPU and _IPU_AVAILABLE: + self.ipus = self.devices + return True + elif accelerator == DeviceType.GPU and torch.cuda.is_available(): + self.gpus = device_parser.parse_gpu_ids(self.devices) + return True + elif accelerator == DeviceType.CPU: + self.num_processes = self.devices + return True + return False + @property def use_dp(self) -> bool: return self._distrib_type == DistributedType.DP diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index ac7a41e3808f2..e3708185432e0 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -112,6 +112,7 @@ def __init__( process_position: int = 0, num_nodes: int = 1, num_processes: int = 1, + devices: Optional[Union[List[int], str, int]] = None, gpus: Optional[Union[List[int], str, int]] = None, auto_select_gpus: bool = False, tpu_cores: Optional[Union[List[int], str, int]] = None, @@ -207,6 +208,9 @@ def __init__( deterministic: If true enables cudnn.deterministic. + devices: Will be mapped to either `gpus`, `tpu_cores`, `num_processes` or `ipus`, + based on the accelerator type. + distributed_backend: deprecated. Please use 'accelerator' fast_dev_run: runs n if set to ``n`` (int) else 1 if set to ``True`` batch(es) @@ -343,8 +347,8 @@ def __init__( self.optimizer_connector = OptimizerConnector(self) self.accelerator_connector = AcceleratorConnector( - num_processes, tpu_cores, ipus, distributed_backend, gpus, gpu_ids, num_nodes, sync_batchnorm, benchmark, - replace_sampler_ddp, deterministic, precision, amp_backend, amp_level, plugins + num_processes, devices, tpu_cores, ipus, distributed_backend, gpus, gpu_ids, num_nodes, sync_batchnorm, + benchmark, replace_sampler_ddp, deterministic, precision, amp_backend, amp_level, plugins ) self.logger_connector = LoggerConnector(self, log_gpu_memory) self.model_connector = ModelConnector(self) From 341c556a201fa83d298fd6df9724ecdc1538677e Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 16 Jul 2021 11:55:02 +0530 Subject: [PATCH 02/19] Update logic for cpus --- .../trainer/connectors/accelerator_connector.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 132ba13873f8b..f7203a3ac89ab 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -181,6 +181,7 @@ def select_accelerator_type(self) -> None: elif self.has_gpu: self._accelerator_type = DeviceType.GPU else: + self._set_devices_to_cpu_num_processes() self._accelerator_type = DeviceType.CPU elif self.distributed_backend == DeviceType.TPU: if not self.has_tpu: @@ -198,6 +199,7 @@ def select_accelerator_type(self) -> None: raise MisconfigurationException(f"You passed `accelerator='gpu'`, but {msg}.") self._accelerator_type = DeviceType.GPU elif self.distributed_backend == DeviceType.CPU: + self._set_devices_to_cpu_num_processes() self._accelerator_type = DeviceType.CPU if self.distributed_backend in ["auto"] + list(DeviceType): @@ -293,8 +295,6 @@ def cluster_environment(self) -> ClusterEnvironment: @property def has_cpu(self) -> bool: - if self.num_processes is not None: - self._map_devices_to_accelerator(DeviceType.CPU) return True @property @@ -344,8 +344,12 @@ def has_ipu(self) -> bool: def use_ipu(self) -> bool: return self._accelerator_type == DeviceType.IPU and self.has_ipu + def _set_devices_to_cpu_num_processes(self) -> None: + if self.num_processes <= 1: + self._map_devices_to_accelerator(DeviceType.CPU) + def _map_devices_to_accelerator(self, accelerator: str) -> bool: - if self.devices is not None: + if self.devices is None: return False if accelerator == DeviceType.TPU and _TPU_AVAILABLE: self.tpu_cores = device_parser.parse_tpu_cores(self.devices) From fe0cb21a22f5879b446850e5548ec90e5d01dd52 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 16 Jul 2021 12:09:53 +0530 Subject: [PATCH 03/19] Add tests for cpus --- .../test_accelerator_connector.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index e2f8ca0a4074d..66c618b4a6974 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -639,3 +639,23 @@ def test_accelerator_cpu_with_multiple_gpus(): assert trainer._device_type == "cpu" assert isinstance(trainer.accelerator, CPUAccelerator) + + +@pytest.mark.parametrize(["devices", "plugin"], [(1, SingleDevicePlugin), (5, DDPSpawnPlugin)]) +def test_accelerator_cpu_with_devices(devices, plugin): + + trainer = Trainer(accelerator="cpu", devices=devices) + + assert trainer.num_processes == devices + + assert isinstance(trainer.training_type_plugin, plugin) + assert isinstance(trainer.accelerator, CPUAccelerator) + + +def test_accelerator_cpu_with_num_processes_priority(): + """ Test for checking num_processes takes priority over devices. """ + + num_processes = 5 + trainer = Trainer(accelerator="cpu", devices=8, num_processes=num_processes) + + assert trainer.num_processes == num_processes From 1b5489334b5d5b58f0ddbc2b2a245179387c2284 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 16 Jul 2021 12:24:50 +0530 Subject: [PATCH 04/19] Add tests for gpus --- .../connectors/accelerator_connector.py | 3 +- .../test_accelerator_connector.py | 31 +++++++++++++++++-- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index f7203a3ac89ab..6cfa67577a802 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -358,7 +358,8 @@ def _map_devices_to_accelerator(self, accelerator: str) -> bool: self.ipus = self.devices return True elif accelerator == DeviceType.GPU and torch.cuda.is_available(): - self.gpus = device_parser.parse_gpu_ids(self.devices) + self.gpus = self.devices + self.parallel_device_ids = device_parser.parse_gpu_ids(self.devices) return True elif accelerator == DeviceType.CPU: self.num_processes = self.devices diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 66c618b4a6974..9a8d94d090940 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -647,7 +647,6 @@ def test_accelerator_cpu_with_devices(devices, plugin): trainer = Trainer(accelerator="cpu", devices=devices) assert trainer.num_processes == devices - assert isinstance(trainer.training_type_plugin, plugin) assert isinstance(trainer.accelerator, CPUAccelerator) @@ -657,5 +656,33 @@ def test_accelerator_cpu_with_num_processes_priority(): num_processes = 5 trainer = Trainer(accelerator="cpu", devices=8, num_processes=num_processes) - assert trainer.num_processes == num_processes + + +@RunIf(min_gpus=2) +@pytest.mark.parametrize(["devices", "plugin"], [(1, SingleDevicePlugin), (2, DDPSpawnPlugin)]) +def test_accelerator_gpu_with_devices(devices, plugin): + + trainer = Trainer(accelerator="gpu", devices=devices) + + assert trainer.gpus == devices + assert isinstance(trainer.training_type_plugin, plugin) + assert isinstance(trainer.accelerator, GPUAccelerator) + + +@RunIf(min_gpus=1) +def test_accelerator_auto_with_devices_gpu(): + + trainer = Trainer(accelerator="auto", devices=1) + + assert trainer._device_type == "gpu" + assert trainer.gpus == 1 + + +@RunIf(min_gpus=1) +def test_accelerator_gpu_with_gpus_priority(): + """ Test for checking `gpus` flag takes priority over `devices`. """ + + gpus = 1 + trainer = Trainer(accelerator="gpu", devices=4, gpus=gpus) + assert trainer.gpus == gpus From 400df104983a2d72c1da27a47d0b000a17faee00 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 16 Jul 2021 12:45:12 +0530 Subject: [PATCH 05/19] Add tests for tpus & ipus --- tests/accelerators/test_ipu.py | 28 ++++++++++++++++++++++++ tests/accelerators/test_tpu_backend.py | 30 ++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index e4e5ebe1b7827..022d1aae5add6 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -519,3 +519,31 @@ def test_accelerator_cpu_with_ipus_flag(): assert trainer._device_type == "cpu" assert isinstance(trainer.accelerator, CPUAccelerator) + + +@RunIf(ipu=True) +def test_accelerator_ipu_with_devices(): + + trainer = Trainer(accelerator="ipu", devices=8) + + assert trainer.ipus == 8 + assert isinstance(trainer.training_type_plugin, IPUPlugin) + assert isinstance(trainer.accelerator, IPUAccelerator) + + +@RunIf(ipu=True) +def test_accelerator_auto_with_devices_ipu(): + + trainer = Trainer(accelerator="auto", devices=8) + + assert trainer._device_type == "ipu" + assert trainer.ipus == 8 + + +@RunIf(ipu=True) +def test_accelerator_ipu_with_ipus_priority(): + """ Test for checking `ipus` flag takes priority over `devices`. """ + + ipus = 8 + trainer = Trainer(accelerator="ipu", devices=1, ipus=ipus) + assert trainer.ipus == ipus diff --git a/tests/accelerators/test_tpu_backend.py b/tests/accelerators/test_tpu_backend.py index 574f97deeafe6..48653d09874f2 100644 --- a/tests/accelerators/test_tpu_backend.py +++ b/tests/accelerators/test_tpu_backend.py @@ -18,6 +18,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.tpu import TPUAccelerator +from pytorch_lightning.plugins import SingleTPUPlugin, TPUSpawnPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel from tests.helpers.runif import RunIf @@ -148,3 +149,32 @@ def test_accelerator_tpu_with_auto(): assert trainer._device_type == "tpu" assert isinstance(trainer.accelerator, TPUAccelerator) + + +@RunIf(tpu=True) +@pytest.mark.parametrize(["devices", "plugin"], [([0], SingleTPUPlugin), (8, TPUSpawnPlugin)]) +def test_accelerator_tpu_with_devices(devices, plugin): + + trainer = Trainer(accelerator="tpu", devices=devices) + + assert trainer.tpu_cores == devices + assert isinstance(trainer.training_type_plugin, plugin) + assert isinstance(trainer.accelerator, TPUAccelerator) + + +@RunIf(tpu=True) +def test_accelerator_auto_with_devices_tpu(): + + trainer = Trainer(accelerator="auto", devices=8) + + assert trainer._device_type == "tpu" + assert trainer.tpu_cores == 8 + + +@RunIf(tpu=True) +def test_accelerator_tpu_with_tpu_cores_priority(): + """ Test for checking `tpu_cores` flag takes priority over `devices`. """ + + tpu_cores = 8 + trainer = Trainer(accelerator="tpu", devices=1, tpu_cores=tpu_cores) + assert trainer.tpu_cores == tpu_cores From a4eeed783ece8de37bc468c473b5c50c91622d6b Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 16 Jul 2021 12:47:00 +0530 Subject: [PATCH 06/19] Update changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1783bb0f2a77e..37e28f3fc45da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -172,6 +172,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Enabled traditional/manual launching of DDP processes through `LOCAL_RANK` and `NODE_RANK` environment variable assignments ([#7480](https://github.com/PyTorchLightning/pytorch-lightning/pull/7480)) +- Added support for `devices` flag to Trainer ([#8440](https://github.com/PyTorchLightning/pytorch-lightning/pull/8440)) + + ### Changed From 752529484b39aaa623be742c4e65a1c6d6c0f1c2 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 16 Jul 2021 13:14:20 +0530 Subject: [PATCH 07/19] Update test --- tests/accelerators/test_accelerator_connector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 9a8d94d090940..2e275f5a1696e 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -660,7 +660,8 @@ def test_accelerator_cpu_with_num_processes_priority(): @RunIf(min_gpus=2) -@pytest.mark.parametrize(["devices", "plugin"], [(1, SingleDevicePlugin), (2, DDPSpawnPlugin)]) +@pytest.mark.parametrize(["devices", "plugin"], [(1, SingleDevicePlugin), ([1], SingleDevicePlugin), + (2, DDPSpawnPlugin)]) def test_accelerator_gpu_with_devices(devices, plugin): trainer = Trainer(accelerator="gpu", devices=devices) From f89becafdba435b0a6903fa6790e4fe5fd09adb7 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 16 Jul 2021 14:18:52 +0530 Subject: [PATCH 08/19] Update test --- tests/accelerators/test_tpu_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accelerators/test_tpu_backend.py b/tests/accelerators/test_tpu_backend.py index 48653d09874f2..9aaf342fe5749 100644 --- a/tests/accelerators/test_tpu_backend.py +++ b/tests/accelerators/test_tpu_backend.py @@ -152,7 +152,7 @@ def test_accelerator_tpu_with_auto(): @RunIf(tpu=True) -@pytest.mark.parametrize(["devices", "plugin"], [([0], SingleTPUPlugin), (8, TPUSpawnPlugin)]) +@pytest.mark.parametrize(["devices", "plugin"], [([1], SingleTPUPlugin), (8, TPUSpawnPlugin)]) def test_accelerator_tpu_with_devices(devices, plugin): trainer = Trainer(accelerator="tpu", devices=devices) From 87fe1361590c9ecdbed8807afa6804c961debb79 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Fri, 16 Jul 2021 18:06:14 +0530 Subject: [PATCH 09/19] Update tests/accelerators/test_accelerator_connector.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- tests/accelerators/test_accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 2e275f5a1696e..7573e06f9a764 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -661,7 +661,7 @@ def test_accelerator_cpu_with_num_processes_priority(): @RunIf(min_gpus=2) @pytest.mark.parametrize(["devices", "plugin"], [(1, SingleDevicePlugin), ([1], SingleDevicePlugin), - (2, DDPSpawnPlugin)]) + (2, DDPSpawnPlugin),]) def test_accelerator_gpu_with_devices(devices, plugin): trainer = Trainer(accelerator="gpu", devices=devices) From 1f82a3e673bf192c9f19f441ed8468b954046989 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 Jul 2021 12:37:19 +0000 Subject: [PATCH 10/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/accelerators/test_accelerator_connector.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 7573e06f9a764..92282191d1c2e 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -660,8 +660,11 @@ def test_accelerator_cpu_with_num_processes_priority(): @RunIf(min_gpus=2) -@pytest.mark.parametrize(["devices", "plugin"], [(1, SingleDevicePlugin), ([1], SingleDevicePlugin), - (2, DDPSpawnPlugin),]) +@pytest.mark.parametrize(["devices", "plugin"], [ + (1, SingleDevicePlugin), + ([1], SingleDevicePlugin), + (2, DDPSpawnPlugin), +]) def test_accelerator_gpu_with_devices(devices, plugin): trainer = Trainer(accelerator="gpu", devices=devices) From e44cf7b1df27954e9053f791d6b2267da14573f6 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Fri, 16 Jul 2021 18:11:59 +0530 Subject: [PATCH 11/19] Update test --- tests/accelerators/test_tpu_backend.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/accelerators/test_tpu_backend.py b/tests/accelerators/test_tpu_backend.py index 9aaf342fe5749..035edc07a729c 100644 --- a/tests/accelerators/test_tpu_backend.py +++ b/tests/accelerators/test_tpu_backend.py @@ -18,7 +18,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.tpu import TPUAccelerator -from pytorch_lightning.plugins import SingleTPUPlugin, TPUSpawnPlugin +from pytorch_lightning.plugins import TPUSpawnPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel from tests.helpers.runif import RunIf @@ -152,13 +152,12 @@ def test_accelerator_tpu_with_auto(): @RunIf(tpu=True) -@pytest.mark.parametrize(["devices", "plugin"], [([1], SingleTPUPlugin), (8, TPUSpawnPlugin)]) -def test_accelerator_tpu_with_devices(devices, plugin): +def test_accelerator_tpu_with_devices(): - trainer = Trainer(accelerator="tpu", devices=devices) + trainer = Trainer(accelerator="tpu", devices=8) - assert trainer.tpu_cores == devices - assert isinstance(trainer.training_type_plugin, plugin) + assert trainer.tpu_cores == 8 + assert isinstance(trainer.training_type_plugin, TPUSpawnPlugin) assert isinstance(trainer.accelerator, TPUAccelerator) From 0927d7fb818540fbd98ddb061ef87f55b130eaef Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Sat, 17 Jul 2021 21:24:48 +0530 Subject: [PATCH 12/19] Add set devices if none --- .../connectors/accelerator_connector.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 6cfa67577a802..a08500e2a0c37 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -138,6 +138,7 @@ def __init__( self.plugins = plugins + self.validate_accelerator_and_devices() self.select_accelerator_type() self.set_distributed_mode() self.configure_slurm_ddp() @@ -145,6 +146,7 @@ def __init__( self.handle_given_plugins() self.update_device_type_if_ipu_plugin() self.validate_accelerator_type() + self.set_devices_if_none() self._training_type_plugin_resolved = False self.accelerator = self.select_accelerator() @@ -205,6 +207,14 @@ def select_accelerator_type(self) -> None: if self.distributed_backend in ["auto"] + list(DeviceType): self.distributed_backend = None + def validate_accelerator_and_devices(self) -> None: + if self.distributed_backend not in ["auto"] + list(DeviceType) and self.devices is not None: + raise MisconfigurationException( + f"You passed `devices={self.devices}` but haven't specified" + " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu') for the devices mapping" + f" but passed `accelerator={self.distributed_backend}`" + ) + def validate_accelerator_type(self) -> None: if self._accelerator_type and self._accelerator_type != self._device_type: raise MisconfigurationException( @@ -213,6 +223,18 @@ def validate_accelerator_type(self) -> None: ) self._accelerator_type = self._device_type + def set_devices_if_none(self) -> None: + if self.devices is not None: + return + if self._accelerator_type == DeviceType.TPU: + self.devices = self.tpu_cores + elif self._accelerator_type == DeviceType.IPU: + self.devices = self.ipus + elif self._accelerator_type == DeviceType.GPU: + self.devices = self.gpus + elif self._accelerator_type == DeviceType.CPU: + self.devices = self.num_processes + def handle_given_plugins(self) -> None: training_type = None From 1c5dbad738bb55f90a43a1486a317d6fa8572d4c Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Sun, 18 Jul 2021 09:32:25 +0530 Subject: [PATCH 13/19] Add tests --- .../connectors/accelerator_connector.py | 2 +- pytorch_lightning/trainer/properties.py | 4 ++++ .../test_accelerator_connector.py | 19 +++++++++++++++++++ tests/accelerators/test_ipu.py | 7 +++++++ tests/accelerators/test_tpu_backend.py | 7 +++++++ 5 files changed, 38 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index a08500e2a0c37..9d810c14f822d 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -211,7 +211,7 @@ def validate_accelerator_and_devices(self) -> None: if self.distributed_backend not in ["auto"] + list(DeviceType) and self.devices is not None: raise MisconfigurationException( f"You passed `devices={self.devices}` but haven't specified" - " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu') for the devices mapping" + " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu')` for the devices mapping" f" but passed `accelerator={self.distributed_backend}`" ) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 54d0079b9255e..17e6bec334740 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -144,6 +144,10 @@ def ipus(self) -> int: def num_gpus(self) -> int: return self.accelerator_connector.num_gpus + @property + def devices(self) -> Optional[Union[List[int], str, int]]: + return self.accelerator_connector.devices + @property def data_parallel_device_ids(self) -> Optional[List[int]]: return self.accelerator_connector.parallel_device_ids diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 92282191d1c2e..44b95831d9fee 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -690,3 +690,22 @@ def test_accelerator_gpu_with_gpus_priority(): gpus = 1 trainer = Trainer(accelerator="gpu", devices=4, gpus=gpus) assert trainer.gpus == gpus + + +def test_validate_accelerator_and_devices(): + + with pytest.raises(MisconfigurationException, match="You passed `devices=2` but haven't specified"): + Trainer(accelerator="ddp_cpu", devices=2) + + +def test_set_devices_if_none_cpu(): + + trainer = Trainer(accelerator="cpu", num_processes=3) + assert trainer.devices == 3 + + +@RunIf(min_gpus=2) +def test_set_devices_if_none_gpu(): + + trainer = Trainer(accelerator="gpu", gpus=2) + assert trainer.devices == 2 diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index 022d1aae5add6..a57247180930e 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -547,3 +547,10 @@ def test_accelerator_ipu_with_ipus_priority(): ipus = 8 trainer = Trainer(accelerator="ipu", devices=1, ipus=ipus) assert trainer.ipus == ipus + + +@RunIf(ipu=True) +def test_set_devices_if_none_ipu(): + + trainer = Trainer(accelerator="ipu", ipus=8) + assert trainer.devices == 8 diff --git a/tests/accelerators/test_tpu_backend.py b/tests/accelerators/test_tpu_backend.py index 035edc07a729c..40ec30c0c30a3 100644 --- a/tests/accelerators/test_tpu_backend.py +++ b/tests/accelerators/test_tpu_backend.py @@ -177,3 +177,10 @@ def test_accelerator_tpu_with_tpu_cores_priority(): tpu_cores = 8 trainer = Trainer(accelerator="tpu", devices=1, tpu_cores=tpu_cores) assert trainer.tpu_cores == tpu_cores + + +@RunIf(tpu=True) +def test_set_devices_if_none_tpu(): + + trainer = Trainer(accelerator="tpu", tpu_cores=8) + assert trainer.devices == 8 From b579377fd3425c3ee98b6b56e0ccbece9178318c Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Sun, 18 Jul 2021 10:11:04 +0530 Subject: [PATCH 14/19] Warn if devices flag ignored --- .../connectors/accelerator_connector.py | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 9d810c14f822d..90336da5d2760 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -139,6 +139,8 @@ def __init__( self.plugins = plugins self.validate_accelerator_and_devices() + self.warn_if_devices_flag_ignored() + self.select_accelerator_type() self.set_distributed_mode() self.configure_slurm_ddp() @@ -223,6 +225,32 @@ def validate_accelerator_type(self) -> None: ) self._accelerator_type = self._device_type + def warn_if_devices_flag_ignored(self) -> None: + if self.devices is None: + return + devices_warning = f"`devices={self.devices}` will be ignored, as you have set" + if self.distributed_backend == "auto": + if self.tpu_cores is not None: + rank_zero_warn(f"{devices_warning} `tpu_cores={self.tpu_cores}`") + elif self.ipus is not None: + rank_zero_warn(f"{devices_warning} `ipus={self.ipus}`") + elif self.gpus is not None: + rank_zero_warn(f"{devices_warning} `gpus={self.gpus}`") + elif self.num_processes == 1: + rank_zero_warn(f"{devices_warning} `num_processes={self.num_processes}`") + elif self.distributed_backend == DeviceType.TPU: + if self.tpu_cores is not None: + rank_zero_warn(f"{devices_warning} `tpu_cores={self.tpu_cores}`") + elif self.distributed_backend == DeviceType.IPU: + if self.ipus is not None: + rank_zero_warn(f"{devices_warning} `ipus={self.ipus}`") + elif self.distributed_backend == DeviceType.GPU: + if self.gpus is not None: + rank_zero_warn(f"{devices_warning} `gpus={self.gpus}`") + elif self.distributed_backend == DeviceType.CPU: + if self.num_processes == 1: + rank_zero_warn(f"{devices_warning} `num_processes={self.num_processes}`") + def set_devices_if_none(self) -> None: if self.devices is not None: return @@ -367,7 +395,7 @@ def use_ipu(self) -> bool: return self._accelerator_type == DeviceType.IPU and self.has_ipu def _set_devices_to_cpu_num_processes(self) -> None: - if self.num_processes <= 1: + if self.num_processes == 1: self._map_devices_to_accelerator(DeviceType.CPU) def _map_devices_to_accelerator(self, accelerator: str) -> bool: From 669095c7aa3eee6a595598eda046d70361f4d916 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 19 Jul 2021 19:59:19 +0530 Subject: [PATCH 15/19] Make certain methods protected --- .../trainer/connectors/accelerator_connector.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 90336da5d2760..c889c97e60e6e 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -138,8 +138,8 @@ def __init__( self.plugins = plugins - self.validate_accelerator_and_devices() - self.warn_if_devices_flag_ignored() + self._validate_accelerator_and_devices() + self._warn_if_devices_flag_ignored() self.select_accelerator_type() self.set_distributed_mode() @@ -147,8 +147,9 @@ def __init__( self.handle_given_plugins() self.update_device_type_if_ipu_plugin() - self.validate_accelerator_type() - self.set_devices_if_none() + + self._validate_accelerator_type() + self._set_devices_if_none() self._training_type_plugin_resolved = False self.accelerator = self.select_accelerator() @@ -209,7 +210,7 @@ def select_accelerator_type(self) -> None: if self.distributed_backend in ["auto"] + list(DeviceType): self.distributed_backend = None - def validate_accelerator_and_devices(self) -> None: + def _validate_accelerator_and_devices(self) -> None: if self.distributed_backend not in ["auto"] + list(DeviceType) and self.devices is not None: raise MisconfigurationException( f"You passed `devices={self.devices}` but haven't specified" @@ -217,7 +218,7 @@ def validate_accelerator_and_devices(self) -> None: f" but passed `accelerator={self.distributed_backend}`" ) - def validate_accelerator_type(self) -> None: + def _validate_accelerator_type(self) -> None: if self._accelerator_type and self._accelerator_type != self._device_type: raise MisconfigurationException( f"Mismatch between the requested accelerator type ({self._accelerator_type})" @@ -225,7 +226,7 @@ def validate_accelerator_type(self) -> None: ) self._accelerator_type = self._device_type - def warn_if_devices_flag_ignored(self) -> None: + def _warn_if_devices_flag_ignored(self) -> None: if self.devices is None: return devices_warning = f"`devices={self.devices}` will be ignored, as you have set" @@ -251,7 +252,7 @@ def warn_if_devices_flag_ignored(self) -> None: if self.num_processes == 1: rank_zero_warn(f"{devices_warning} `num_processes={self.num_processes}`") - def set_devices_if_none(self) -> None: + def _set_devices_if_none(self) -> None: if self.devices is not None: return if self._accelerator_type == DeviceType.TPU: From 408846fd543928d7d5c874e37fc1e895c83c141d Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Mon, 19 Jul 2021 20:01:30 +0530 Subject: [PATCH 16/19] Update pytorch_lightning/trainer/connectors/accelerator_connector.py Co-authored-by: Jirka Borovec --- .../trainer/connectors/accelerator_connector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index c889c97e60e6e..c46a5e92381c9 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -405,14 +405,14 @@ def _map_devices_to_accelerator(self, accelerator: str) -> bool: if accelerator == DeviceType.TPU and _TPU_AVAILABLE: self.tpu_cores = device_parser.parse_tpu_cores(self.devices) return True - elif accelerator == DeviceType.IPU and _IPU_AVAILABLE: + if accelerator == DeviceType.IPU and _IPU_AVAILABLE: self.ipus = self.devices return True - elif accelerator == DeviceType.GPU and torch.cuda.is_available(): + if accelerator == DeviceType.GPU and torch.cuda.is_available(): self.gpus = self.devices self.parallel_device_ids = device_parser.parse_gpu_ids(self.devices) return True - elif accelerator == DeviceType.CPU: + if accelerator == DeviceType.CPU: self.num_processes = self.devices return True return False From 188f86289c926fce77ef418c8ec0595ebb48200b Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Mon, 19 Jul 2021 20:06:27 +0530 Subject: [PATCH 17/19] Update pytorch_lightning/trainer/connectors/accelerator_connector.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- pytorch_lightning/trainer/connectors/accelerator_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index c46a5e92381c9..24cb020202472 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -214,8 +214,8 @@ def _validate_accelerator_and_devices(self) -> None: if self.distributed_backend not in ["auto"] + list(DeviceType) and self.devices is not None: raise MisconfigurationException( f"You passed `devices={self.devices}` but haven't specified" - " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu')` for the devices mapping" - f" but passed `accelerator={self.distributed_backend}`" + " `accelerator=('auto'|'tpu'|'gpu'|'ipu'|'cpu')` for the devices mapping," + f" got `accelerator={self.distributed_backend}`." ) def _validate_accelerator_type(self) -> None: From 56578819e9f18df48fca4a6370e7423cda69e8ac Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Mon, 19 Jul 2021 23:16:57 +0530 Subject: [PATCH 18/19] Update tests --- .../trainer/connectors/accelerator_connector.py | 6 +++--- tests/accelerators/test_accelerator_connector.py | 8 ++++++-- tests/accelerators/test_ipu.py | 4 +++- tests/accelerators/test_tpu_backend.py | 4 +++- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index c889c97e60e6e..c4ff0974d306a 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -229,7 +229,7 @@ def _validate_accelerator_type(self) -> None: def _warn_if_devices_flag_ignored(self) -> None: if self.devices is None: return - devices_warning = f"`devices={self.devices}` will be ignored, as you have set" + devices_warning = f"The flag `devices={self.devices}` will be ignored, as you have set" if self.distributed_backend == "auto": if self.tpu_cores is not None: rank_zero_warn(f"{devices_warning} `tpu_cores={self.tpu_cores}`") @@ -237,7 +237,7 @@ def _warn_if_devices_flag_ignored(self) -> None: rank_zero_warn(f"{devices_warning} `ipus={self.ipus}`") elif self.gpus is not None: rank_zero_warn(f"{devices_warning} `gpus={self.gpus}`") - elif self.num_processes == 1: + elif self.num_processes != 1: rank_zero_warn(f"{devices_warning} `num_processes={self.num_processes}`") elif self.distributed_backend == DeviceType.TPU: if self.tpu_cores is not None: @@ -249,7 +249,7 @@ def _warn_if_devices_flag_ignored(self) -> None: if self.gpus is not None: rank_zero_warn(f"{devices_warning} `gpus={self.gpus}`") elif self.distributed_backend == DeviceType.CPU: - if self.num_processes == 1: + if self.num_processes != 1: rank_zero_warn(f"{devices_warning} `num_processes={self.num_processes}`") def _set_devices_if_none(self) -> None: diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 44b95831d9fee..183c066c1283e 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -655,7 +655,9 @@ def test_accelerator_cpu_with_num_processes_priority(): """ Test for checking num_processes takes priority over devices. """ num_processes = 5 - trainer = Trainer(accelerator="cpu", devices=8, num_processes=num_processes) + with pytest.warns(UserWarning, match="The flag `devices=8` will be ignored,"): + trainer = Trainer(accelerator="cpu", devices=8, num_processes=num_processes) + assert trainer.num_processes == num_processes @@ -688,7 +690,9 @@ def test_accelerator_gpu_with_gpus_priority(): """ Test for checking `gpus` flag takes priority over `devices`. """ gpus = 1 - trainer = Trainer(accelerator="gpu", devices=4, gpus=gpus) + with pytest.warns(UserWarning, match="The flag `devices=4` will be ignored,"): + trainer = Trainer(accelerator="gpu", devices=4, gpus=gpus) + assert trainer.gpus == gpus diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index a57247180930e..2802c4dfc4659 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -545,7 +545,9 @@ def test_accelerator_ipu_with_ipus_priority(): """ Test for checking `ipus` flag takes priority over `devices`. """ ipus = 8 - trainer = Trainer(accelerator="ipu", devices=1, ipus=ipus) + with pytest.warns(UserWarning, match="The flag `devices=1` will be ignored,"): + trainer = Trainer(accelerator="ipu", devices=1, ipus=ipus) + assert trainer.ipus == ipus diff --git a/tests/accelerators/test_tpu_backend.py b/tests/accelerators/test_tpu_backend.py index 40ec30c0c30a3..175344f897803 100644 --- a/tests/accelerators/test_tpu_backend.py +++ b/tests/accelerators/test_tpu_backend.py @@ -175,7 +175,9 @@ def test_accelerator_tpu_with_tpu_cores_priority(): """ Test for checking `tpu_cores` flag takes priority over `devices`. """ tpu_cores = 8 - trainer = Trainer(accelerator="tpu", devices=1, tpu_cores=tpu_cores) + with pytest.warns(UserWarning, match="The flag `devices=1` will be ignored,"): + trainer = Trainer(accelerator="tpu", devices=1, tpu_cores=tpu_cores) + assert trainer.tpu_cores == tpu_cores From 25456e05a308e1b4040498b827e03d0d280dbd07 Mon Sep 17 00:00:00 2001 From: Kaushik B Date: Tue, 20 Jul 2021 09:21:35 +0530 Subject: [PATCH 19/19] Raise error for devices for cpu not being int --- .../trainer/connectors/accelerator_connector.py | 5 +++++ tests/accelerators/test_accelerator_connector.py | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 948ad3ee06576..6250857bf5b7a 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -413,6 +413,11 @@ def _map_devices_to_accelerator(self, accelerator: str) -> bool: self.parallel_device_ids = device_parser.parse_gpu_ids(self.devices) return True if accelerator == DeviceType.CPU: + if not isinstance(self.devices, int): + raise MisconfigurationException( + "The flag `devices` only supports integer for `accelerator='cpu'`," + f" got `devices={self.devices}` instead." + ) self.num_processes = self.devices return True return False diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index 183c066c1283e..164aa6db12d7d 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -713,3 +713,9 @@ def test_set_devices_if_none_gpu(): trainer = Trainer(accelerator="gpu", gpus=2) assert trainer.devices == 2 + + +def test_devices_with_cpu_only_supports_integer(): + + with pytest.raises(MisconfigurationException, match="The flag `devices` only supports integer"): + Trainer(accelerator="cpu", devices="1,3")