Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix SAC BC breaking and renaming to ac_freq #397

Merged
merged 3 commits on
Jun 13, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 14 additions & 9 deletions torchtitan/parallelisms/parallelize_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@
SequenceParallel,
)

from torch.utils.checkpoint import _pt2_selective_checkpoint_context_fn_gen, checkpoint

from torchtitan.config_manager import JobConfig, TORCH_DTYPE_MAP
from torchtitan.logging_utils import logger

Expand All @@ -45,22 +43,29 @@
# currently selective per op and per layer checkpointing are supported
def checkpoint_wrapper(module, config):
if config.mode == "selective" and config.selective_ac_option == "op":
from torch.utils.checkpoint import (
CheckpointPolicy,
create_selective_checkpoint_contexts,
)

def _get_custom_policy(meta):
def _custom_policy(mode, func, *args, **kwargs):
mm_count_key = f"{mode}_mm_count"
if func == torch.ops.aten.mm.default:
meta[mm_count_key] += 1
# Saves output of all compute ops, except every second mm
return func in no_recompute_list and not (
if func in no_recompute_list and not (
func == torch.ops.aten.mm.default and meta[mm_count_key] % 2 == 0
)
):
return CheckpointPolicy.MUST_SAVE
else:
return CheckpointPolicy.PREFER_RECOMPUTE

return _custom_policy

def selective_checkpointing_context_fn():
meta = defaultdict(int)
return _pt2_selective_checkpoint_context_fn_gen(_get_custom_policy(meta))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @wanchaol, I think this PR (or #397) might have broken SAC + compile. I didn't verify though since _pt2_selective_checkpoint_context_fn_gen isn't in PyTorch trunk anymore. Do you think it's due to this change, or SAC + compile was already broken?

return create_selective_checkpoint_contexts(_get_custom_policy(meta))

return ptd_checkpoint_wrapper(
module,
Expand All @@ -86,15 +91,15 @@ def selective_checkpointing_context_fn():
1 == checkpointing every one (all).
2 == checkpoint every 2nd one
"""
every_x_layer = int(config.selective_ac_option)
ac_freq = int(config.selective_ac_option)
assert (
every_x_layer >= 0
), f"selective layer AC policy (every_x_layer) expects a positive integer, received {every_x_layer}"
ac_freq >= 0
), f"selective layer AC policy (ac_freq) expects a positive integer, received {ac_freq}"

checkpoint_wrapper.__dict__.setdefault("_count", 0)

checkpoint_wrapper._count += 1
if not every_x_layer or checkpoint_wrapper._count % every_x_layer == 0:
if not ac_freq or checkpoint_wrapper._count % ac_freq == 0:
return ptd_checkpoint_wrapper(
module,
checkpoint_impl=CheckpointImpl.NO_REENTRANT,
Expand Down
Loading