profiling ops on xpu #2249

Merged · 4 commits · Jan 24, 2025
2 changes: 2 additions & 0 deletions recipes/dev/early_exit_finetune_distributed.py
@@ -870,6 +870,7 @@ def train(self) -> None:
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -1019,6 +1020,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

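A minimal illustrative sketch of the guard each recipe below now applies (the loop and the wait/warmup/active values here are hypothetical, not taken from any recipe): CUDA allocator history is recorded only when the active device is actually CUDA, so the profiling path stays safe on XPU and CPU backends.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
profile_memory = True
wait_steps, warmup_steps, active_steps = 1, 1, 2

for idx in range(10):
    # Start recording allocator history right before the profiler's active window,
    # but only on CUDA devices.
    if (
        profile_memory
        and idx == wait_steps + warmup_steps
        and device.type == "cuda"
    ):
        torch.cuda.memory._record_memory_history()

    # ... one training step ...

    # Stop recording once the active window ends, again guarded to CUDA only.
    if (
        profile_memory
        and idx == wait_steps + warmup_steps + active_steps
        and device.type == "cuda"
    ):
        torch.cuda.memory._record_memory_history(enabled=None)
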
2 changes: 2 additions & 0 deletions recipes/full_finetune_distributed.py
@@ -723,6 +723,7 @@ def train(self) -> None:
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -846,6 +847,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

3 changes: 2 additions & 1 deletion recipes/full_finetune_single_device.py
@@ -685,9 +685,9 @@ def train(self) -> None:
curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

utils.batch_to_device(batch, self._device)

# Calculate the number of unmasked tokens in the current batch
@@ -766,6 +766,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

1 change: 1 addition & 0 deletions recipes/knowledge_distillation_distributed.py
@@ -846,6 +846,7 @@ def train(self) -> None:
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

2 changes: 2 additions & 0 deletions recipes/knowledge_distillation_single_device.py
@@ -702,6 +702,7 @@ def train(self) -> None:
curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -784,6 +785,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

2 changes: 2 additions & 0 deletions recipes/lora_finetune_distributed.py
@@ -776,6 +776,7 @@ def train(self) -> None:
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -880,6 +881,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

2 changes: 2 additions & 0 deletions recipes/lora_finetune_distributed_multi_dataset.py
@@ -805,6 +805,7 @@ def train(self) -> None:
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -909,6 +910,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

2 changes: 2 additions & 0 deletions recipes/lora_finetune_single_device.py
@@ -688,6 +688,7 @@ def train(self) -> None:
curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -761,6 +762,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

2 changes: 2 additions & 0 deletions recipes/ppo_full_finetune_single_device.py
@@ -935,6 +935,7 @@ def train(self) -> None:
curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -1034,6 +1035,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

2 changes: 2 additions & 0 deletions recipes/qat_distributed.py
@@ -773,6 +773,7 @@ def train(self) -> None:
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -913,6 +914,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

2 changes: 2 additions & 0 deletions recipes/qat_lora_finetune_distributed.py
@@ -820,6 +820,7 @@ def train(self) -> None:
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history()

@@ -924,6 +925,7 @@ def train(self) -> None:
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
and self._device.type == "cuda"
):
torch.cuda.memory._record_memory_history(enabled=None)

5 changes: 5 additions & 0 deletions tests/torchtune/training/test_profiler.py
@@ -39,6 +39,7 @@ def profiler_cfg():
enabled: True
cpu: True
cuda: True
xpu: True
profile_memory: False
with_stack: False
record_shapes: True
@@ -92,6 +93,7 @@ def reference_profiler_basic():
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
torch.profiler.ProfilerActivity.XPU,
],
schedule=torch.profiler.schedule(wait=3, warmup=1, active=1, repeat=0),
profile_memory=False,
@@ -107,6 +109,7 @@ def reference_profiler_full():
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
torch.profiler.ProfilerActivity.XPU,
],
schedule=torch.profiler.schedule(wait=3, warmup=1, active=1, repeat=0),
profile_memory=True,
@@ -194,10 +197,12 @@ def test_default_activities(profiler_cfg):
# Test setup automatically adds CPU + CUDA tracing if neither CPU nor CUDA is specified
cfg.pop("cpu")
cfg.pop("cuda")
cfg.pop("xpu")
profiler, updated_cfg = _setup_profiler(cfg)
assert profiler.activities == DEFAULT_PROFILER_ACTIVITIES
assert updated_cfg.cpu is True
assert updated_cfg.cuda is True
assert updated_cfg.xpu is True


def test_default_output_dir(profiler_cfg):
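The fixture change above implies a profiler config of roughly the following shape. The field names mirror the visible fixture, while the component path and schedule values are assumptions added purely for illustration.

from omegaconf import OmegaConf

profiler_cfg = OmegaConf.create(
    {
        # Component path and schedule values below are assumed for illustration.
        "_component_": "torchtune.training.setup_torch_profiler",
        "enabled": True,
        "cpu": True,
        "cuda": True,
        "xpu": True,  # new flag exercised by the updated tests
        "profile_memory": False,
        "with_stack": False,
        "record_shapes": True,
        "wait_steps": 3,
        "warmup_steps": 1,
        "active_steps": 1,
        "num_cycles": 0,
    }
)
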
10 changes: 8 additions & 2 deletions torchtune/training/_profiler.py
@@ -27,6 +27,7 @@
DEFAULT_PROFILER_ACTIVITIES = {
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
torch.profiler.ProfilerActivity.XPU,
}

DEFAULT_SCHEDULE: dict = {
@@ -111,7 +112,7 @@ def trace_handler(
log.info(f"Finished dumping traces in {time.monotonic() - begin:.2f} seconds")

# Memory timeline sometimes fails to export
if prof.profile_memory:
if prof.profile_memory and torch.cuda.is_available():
if rank == 0:
try:
prof.export_memory_timeline(
@@ -185,6 +186,7 @@ def setup_torch_profiler(
enabled: bool = False,
cpu: bool = True,
cuda: bool = True,
xpu: bool = True,
profile_memory: bool = DEFAULT_TRACE_OPTS["profile_memory"],
with_stack: bool = DEFAULT_TRACE_OPTS["with_stack"],
record_shapes: bool = DEFAULT_TRACE_OPTS["record_shapes"],
@@ -252,6 +254,7 @@
enabled (bool): Enable pytorch profiler. Default is False.
cpu (bool): Enable cpu profiling. Default is True.
cuda (bool): Enable cuda profiling. Default is True.
xpu (bool): Enable xpu profiling. Default is True.
profile_memory (bool): Profile memory usage. Default is False.
with_stack (bool): Profile stack. Default is False.
record_shapes (bool): Record shapes. Default is True.
@@ -276,10 +279,12 @@
activities.append(torch.profiler.ProfilerActivity.CPU)
if cuda:
activities.append(torch.profiler.ProfilerActivity.CUDA)
if xpu:
activities.append(torch.profiler.ProfilerActivity.XPU)
if len(activities) == 0:
_warn("No activities specified, defaulting to CPU + CUDA")
activities = DEFAULT_PROFILER_ACTIVITIES
cpu = cuda = True
cpu = cuda = xpu = True

# Check for schedule
# 1) If no schedule is provided, set to DEFAULT_SCHEDULE
@@ -372,6 +377,7 @@ def setup_torch_profiler(
"output_dir": output_dir,
"cpu": cpu,
"cuda": cuda,
"xpu": xpu,
"profile_memory": profile_memory,
"with_stack": with_stack,
"record_shapes": record_shapes,
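Taken together, the API change can be exercised roughly as below. This is a sketch, assuming `setup_torch_profiler` is importable from `torchtune.training` and that the keyword arguments shown are accepted with the remaining schedule options left at their defaults; the memory-timeline export in `trace_handler` still runs only when CUDA is available, per the guard added above.

import torch
from torchtune.training import setup_torch_profiler

# Enable each backend based on what the current machine actually provides.
profiler, profiler_cfg = setup_torch_profiler(
    enabled=True,
    cpu=True,
    cuda=torch.cuda.is_available(),
    xpu=hasattr(torch, "xpu") and torch.xpu.is_available(),
    profile_memory=False,
    with_stack=False,
    record_shapes=True,
    output_dir="./profiler_output",  # hypothetical location
)

with profiler:
    for _ in range(8):
        # ... one training step ...
        profiler.step()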