profiling ops on xpu #2249

Merged · 4 commits · Jan 24, 2025
2 changes: 2 additions & 0 deletions recipes/dev/early_exit_finetune_distributed.py
@@ -870,6 +870,7 @@ def train(self) -> None:
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -1019,6 +1020,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

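A minimal illustrative sketch of the guard each recipe below now applies (the loop and the wait/warmup/active values here are hypothetical, not taken from any recipe): CUDA allocator history is recorded only when the active device is actually CUDA, so the profiling path stays safe on XPU and CPU backends.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
profile_memory = True
wait_steps, warmup_steps, active_steps = 1, 1, 2

for idx in range(10):
    # Start recording allocator history right before the profiler's active window,
    # but only on CUDA devices.
    if (
        profile_memory
        and idx == wait_steps + warmup_steps
        and device.type == "cuda"
    ):
        torch.cuda.memory._record_memory_history()

    # ... one training step ...

    # Stop recording once the active window ends, again guarded to CUDA only.
    if (
        profile_memory
        and idx == wait_steps + warmup_steps + active_steps
        and device.type == "cuda"
    ):
        torch.cuda.memory._record_memory_history(enabled=None)
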
2 changes: 2 additions & 0 deletions recipes/full_finetune_distributed.py
@@ -723,6 +723,7 @@ def train(self) -> None:
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -846,6 +847,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

3 changes: 2 additions & 1 deletion recipes/full_finetune_single_device.py
@@ -685,9 +685,9 @@ def train(self) -> None:
curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

utils.batch_to_device(batch, self._device)

# Calculate the number of unmasked tokens in the current batch
@@ -766,6 +766,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

1 change: 1 addition & 0 deletions recipes/knowledge_distillation_distributed.py
@@ -846,6 +846,7 @@ def train(self) -> None:
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

2 changes: 2 additions & 0 deletions recipes/knowledge_distillation_single_device.py
@@ -702,6 +702,7 @@ def train(self) -> None:
curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -784,6 +785,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

2 changes: 2 additions & 0 deletions recipes/lora_finetune_distributed.py
@@ -776,6 +776,7 @@ def train(self) -> None:
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -880,6 +881,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

2 changes: 2 additions & 0 deletions recipes/lora_finetune_distributed_multi_dataset.py
@@ -805,6 +805,7 @@ def train(self) -> None:
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -909,6 +910,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

2 changes: 2 additions & 0 deletions recipes/lora_finetune_single_device.py
@@ -688,6 +688,7 @@ def train(self) -> None:
curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -761,6 +762,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

2 changes: 2 additions & 0 deletions recipes/ppo_full_finetune_single_device.py
@@ -935,6 +935,7 @@ def train(self) -> None:
curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -1034,6 +1035,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

2 changes: 2 additions & 0 deletions recipes/qat_distributed.py
@@ -773,6 +773,7 @@ def train(self) -> None:
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -913,6 +914,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

2 changes: 2 additions & 0 deletions recipes/qat_lora_finetune_distributed.py
@@ -820,6 +820,7 @@ def train(self) -> None:
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -924,6 +925,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

5 changes: 5 additions & 0 deletions tests/torchtune/training/test_profiler.py
@@ -39,6 +39,7 @@ def profiler_cfg():
enabled: True
cpu: True
cuda: True
xpu: True
profile_memory: False
with_stack: False
record_shapes: True
@@ -92,6 +93,7 @@ def reference_profiler_basic():
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
torch.profiler.ProfilerActivity.XPU,
],
schedule=torch.profiler.schedule(wait=3, warmup=1, active=1, repeat=0),
profile_memory=False,
@@ -107,6 +109,7 @@ def reference_profiler_full():
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
torch.profiler.ProfilerActivity.XPU,
],
schedule=torch.profiler.schedule(wait=3, warmup=1, active=1, repeat=0),
profile_memory=True,
@@ -194,10 +197,12 @@ def test_default_activities(profiler_cfg):
# Test setup automatically adds CPU + CUDA tracing if neither CPU nor CUDA is specified
cfg.pop("cpu")
cfg.pop("cuda")
cfg.pop("xpu")
profiler, updated_cfg = _setup_profiler(cfg)
assert profiler.activities == DEFAULT_PROFILER_ACTIVITIES
assert updated_cfg.cpu is True
assert updated_cfg.cuda is True
assert updated_cfg.xpu is True


def test_default_output_dir(profiler_cfg):
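The fixture change above implies a profiler config of roughly the following shape. The field names mirror the visible fixture, while the component path and schedule values are assumptions added purely for illustration.

from omegaconf import OmegaConf

profiler_cfg = OmegaConf.create(
    {
        # Component path and schedule values below are assumed for illustration.
        "_component_": "torchtune.training.setup_torch_profiler",
        "enabled": True,
        "cpu": True,
        "cuda": True,
        "xpu": True,  # new flag exercised by the updated tests
        "profile_memory": False,
        "with_stack": False,
        "record_shapes": True,
        "wait_steps": 3,
        "warmup_steps": 1,
        "active_steps": 1,
        "num_cycles": 0,
    }
)
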
10 changes: 8 additions & 2 deletions torchtune/training/_profiler.py
@@ -27,6 +27,7 @@
DEFAULT_PROFILER_ACTIVITIES = {
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
torch.profiler.ProfilerActivity.XPU,
}

DEFAULT_SCHEDULE: dict = {
@@ -111,7 +112,7 @@ def trace_handler(
log.info(f"Finished dumping traces in {time.monotonic() - begin:.2f} seconds")

# Memory timeline sometimes fails to export
if prof.profile_memory:
if prof.profile_memory and torch.cuda.is_available():
if rank == 0:
try:
prof.export_memory_timeline(
@@ -185,6 +186,7 @@ def setup_torch_profiler(
enabled: bool = False,
cpu: bool = True,
cuda: bool = True,
xpu: bool = True,
profile_memory: bool = DEFAULT_TRACE_OPTS["profile_memory"],
with_stack: bool = DEFAULT_TRACE_OPTS["with_stack"],
record_shapes: bool = DEFAULT_TRACE_OPTS["record_shapes"],
@@ -252,6 +254,7 @@
enabled (bool): Enable pytorch profiler. Default is False.
cpu (bool): Enable cpu profiling. Default is True.
cuda (bool): Enable cuda profiling. Default is True.
xpu (bool): Enable xpu profiling. Default is True.
profile_memory (bool): Profile memory usage. Default is False.
with_stack (bool): Profile stack. Default is False.
record_shapes (bool): Record shapes. Default is True.
@@ -276,10 +279,12 @@
activities.append(torch.profiler.ProfilerActivity.CPU)
if cuda:
activities.append(torch.profiler.ProfilerActivity.CUDA)
if xpu:
activities.append(torch.profiler.ProfilerActivity.XPU)
if len(activities) == 0:
_warn("No activities specified, defaulting to CPU + CUDA")
activities = DEFAULT_PROFILER_ACTIVITIES
cpu = cuda = True
cpu = cuda = xpu = True

# Check for schedule
# 1) If no schedule is provided, set to DEFAULT_SCHEDULE
@@ -372,6 +377,7 @@ def setup_torch_profiler(
"output_dir": output_dir,
"cpu": cpu,
"cuda": cuda,
"xpu": xpu,
"profile_memory": profile_memory,
"with_stack": with_stack,
"record_shapes": record_shapes,
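Taken together, the API change can be exercised roughly as below. This is a sketch, assuming `setup_torch_profiler` is importable from `torchtune.training` and that the keyword arguments shown are accepted with the remaining schedule options left at their defaults; the memory-timeline export in `trace_handler` still runs only when CUDA is available, per the guard added above.

import torch
from torchtune.training import setup_torch_profiler

# Enable each backend based on what the current machine actually provides.
profiler, profiler_cfg = setup_torch_profiler(
    enabled=True,
    cpu=True,
    cuda=torch.cuda.is_available(),
    xpu=hasattr(torch, "xpu") and torch.xpu.is_available(),
    profile_memory=False,
    with_stack=False,
    record_shapes=True,
    output_dir="./profiler_output",  # hypothetical location
)

with profiler:
    for _ in range(8):
        # ... one training step ...
        profiler.step()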