Utilizing subprocess for nnUNet training. (#7576)

Workaround for #7575 ### Description - Due to the impact of #7575, the operation to set the device within nnUNetV2Runner will become ineffective. This PR is intended to resolve this issue. - Add a version check for #7575, will revisit after the update from pytorch team. ### Types of changes  - [x] Non-breaking change (fix or new feature that would not break existing functionality). - [ ] Breaking change (fix or new feature that would cause existing functionality to change). - [ ] New tests added to cover the changes. - [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`. - [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`. - [ ] In-line docstrings updated. - [ ] Documentation updated, tested `make html` command in the `docs/` folder. --------- Signed-off-by: YunLiu <[email protected]>
Project-MONAI · Apr 1, 2024 · 15d2abf · 15d2abf
1 parent 2d463a7
commit 15d2abf
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 52 deletions.
diff --git a/monai/apps/nnunet/nnunetv2_runner.py b/monai/apps/nnunet/nnunetv2_runner.py
@@ -22,6 +22,7 @@
 from monai.apps.nnunet.utils import analyze_data, create_new_data_copy, create_new_dataset_json
 from monai.bundle import ConfigParser
 from monai.utils import ensure_tuple, optional_import
+from monai.utils.misc import run_cmd
 
 load_pickle, _ = optional_import("batchgenerators.utilities.file_and_folder_operations", name="load_pickle")
 join, _ = optional_import("batchgenerators.utilities.file_and_folder_operations", name="join")
@@ -495,65 +496,64 @@ def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int
             fold: fold of the 5-fold cross-validation. Should be an int between 0 and 4.
             gpu_id: an integer to select the device to use, or a tuple/list of GPU device indices used for multi-GPU
                 training (e.g., (0,1)). Default: 0.
-        from nnunetv2.run.run_training import run_training
             kwargs: this optional parameter allows you to specify additional arguments in
-                ``nnunetv2.run.run_training.run_training``. Currently supported args are
-                    - plans_identifier: custom plans identifier. Default: "nnUNetPlans".
-                    - pretrained_weights: path to nnU-Net checkpoint file to be used as pretrained model. Will only be
-                        used when actually training. Beta. Use with caution. Default: False.
-                    - use_compressed_data: True to use compressed data for training. Reading compressed data is much
-                        more CPU and (potentially) RAM intensive and should only be used if you know what you are
-                        doing. Default: False.
-                    - continue_training: continue training from latest checkpoint. Default: False.
-                    - only_run_validation: True to run the validation only. Requires training to have finished.
-                        Default: False.
-                    - disable_checkpointing: True to disable checkpointing. Ideal for testing things out and you
-                        don't want to flood your hard drive with checkpoints. Default: False.
+                ``nnunetv2.run.run_training.run_training_entry``.
+
+                Currently supported args are:
+
+                - p: custom plans identifier. Default: "nnUNetPlans".
+                - pretrained_weights: path to nnU-Net checkpoint file to be used as pretrained model. Will only be
+                    used when actually training. Beta. Use with caution. Default: False.
+                - use_compressed: True to use compressed data for training. Reading compressed data is much
+                    more CPU and (potentially) RAM intensive and should only be used if you know what you are
+                    doing. Default: False.
+                - c: continue training from latest checkpoint. Default: False.
+                - val: True to run the validation only. Requires training to have finished.
+                    Default: False.
+                - disable_checkpointing: True to disable checkpointing. Ideal for testing things out and you
+                    don't want to flood your hard drive with checkpoints. Default: False.
         """
         if "num_gpus" in kwargs:
             kwargs.pop("num_gpus")
             logger.warning("please use gpu_id to set the GPUs to use")
 
-        if "trainer_class_name" in kwargs:
-            kwargs.pop("trainer_class_name")
+        if "tr" in kwargs:
+            kwargs.pop("tr")
             logger.warning("please specify the `trainer_class_name` in the __init__ of `nnUNetV2Runner`.")
 
-        if "export_validation_probabilities" in kwargs:
-            kwargs.pop("export_validation_probabilities")
+        if "npz" in kwargs:
+            kwargs.pop("npz")
             logger.warning("please specify the `export_validation_probabilities` in the __init__ of `nnUNetV2Runner`.")
 
+        cmd = self.train_single_model_command(config, fold, gpu_id, kwargs)
+        run_cmd(cmd, shell=True)
+
+    def train_single_model_command(self, config, fold, gpu_id, kwargs):
         if isinstance(gpu_id, (tuple, list)):
             if len(gpu_id) > 1:
                 gpu_ids_str = ""
                 for _i in range(len(gpu_id)):
                     gpu_ids_str += f"{gpu_id[_i]},"
-                os.environ["CUDA_VISIBLE_DEVICES"] = gpu_ids_str[:-1]
+                device_setting = f"CUDA_VISIBLE_DEVICES={gpu_ids_str[:-1]}"
             else:
-                os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id[0]}"
-        else:
-            os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
-
-        from nnunetv2.run.run_training import run_training
-
-        if isinstance(gpu_id, int) or len(gpu_id) == 1:
-            run_training(
-                dataset_name_or_id=self.dataset_name_or_id,
-                configuration=config,
-                fold=fold,
-                trainer_class_name=self.trainer_class_name,
-                export_validation_probabilities=self.export_validation_probabilities,
-                **kwargs,
-            )
+                device_setting = f"CUDA_VISIBLE_DEVICES={gpu_id[0]}"
         else:
-            run_training(
-                dataset_name_or_id=self.dataset_name_or_id,
-                configuration=config,
-                fold=fold,
-                num_gpus=len(gpu_id),
-                trainer_class_name=self.trainer_class_name,
-                export_validation_probabilities=self.export_validation_probabilities,
-                **kwargs,
-            )
+            device_setting = f"CUDA_VISIBLE_DEVICES={gpu_id}"
+        num_gpus = 1 if isinstance(gpu_id, int) or len(gpu_id) == 1 else len(gpu_id)
+
+        cmd = (
+            f"{device_setting} nnUNetv2_train "
+            + f"{self.dataset_name_or_id} {config} {fold} "
+            + f"-tr {self.trainer_class_name} -num_gpus {num_gpus}"
+        )
+        if self.export_validation_probabilities:
+            cmd += " --npz"
+        for _key, _value in kwargs.items():
+            if _key == "p" or _key == "pretrained_weights":
+                cmd += f" -{_key} {_value}"
+            else:
+                cmd += f" --{_key} {_value}"
+        return cmd
 
     def train(
         self,
@@ -637,15 +637,7 @@ def train_parallel_cmd(
                 if _config in ensure_tuple(configs):
                     for _i in range(self.num_folds):
                         the_device = gpu_id_for_all[_index % n_devices]  # type: ignore
-                        cmd = (
-                            "python -m monai.apps.nnunet nnUNetV2Runner train_single_model "
-                            + f"--input_config '{self.input_config_or_dict}' --work_dir '{self.work_dir}' "
-                            + f"--config '{_config}' --fold {_i} --gpu_id {the_device} "
-                            + f"--trainer_class_name {self.trainer_class_name} "
-                            + f"--export_validation_probabilities {self.export_validation_probabilities}"
-                        )
-                        for _key, _value in kwargs.items():
-                            cmd += f" --{_key} {_value}"
+                        cmd = self.train_single_model_command(_config, _i, the_device, kwargs)
                         all_cmds[-1][the_device].append(cmd)
                         _index += 1
         return all_cmds

diff --git a/tests/test_set_visible_devices.py b/tests/test_set_visible_devices.py
@@ -14,7 +14,7 @@
 import os
 import unittest
 
-from tests.utils import skip_if_no_cuda
+from tests.utils import SkipIfAtLeastPyTorchVersion, skip_if_no_cuda
 
 
 class TestVisibleDevices(unittest.TestCase):
@@ -25,6 +25,7 @@ def run_process_and_get_exit_code(code_to_execute):
         return int(bin(value).replace("0b", "").rjust(16, "0")[:8], 2)
 
     @skip_if_no_cuda
+    @SkipIfAtLeastPyTorchVersion((2, 2, 1))
     def test_visible_devices(self):
         num_gpus_before = self.run_process_and_get_exit_code(
             'python -c "import os; import torch; '