[MetaSchedule] Add software pipeline in CUDA tensor core auto tensorization #12544

Merged 2 commits on Aug 25, 2022

Changes from all commits
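For context, a minimal sketch of how the new `use_software_pipeline` knob can be turned on from the Python API. Everything except the new flag mirrors the existing CUDA tensor core defaults; the wmma intrin names are taken from the expected trace in the test below, and the intrin-group keys ("init", "load_a", ...) are an assumption, not something this diff prescribes:

```python
# Hedged sketch, not part of this PR: constructing the rule with software
# pipelining enabled. The intrin names follow the wmma intrinsics in the
# expected trace below; the group keys ("init", "load_a", ...) are assumed.
from tvm.meta_schedule.schedule_rule import MultiLevelTilingTensorCore, ReuseType

rule = MultiLevelTilingTensorCore(
    intrin_groups=[
        {
            "init": "wmma_fill_16x16x16_f32",
            "load_a": "wmma_load_16x16x16_f16_a",
            "load_b": "wmma_load_16x16x16_f16_b",
            "compute": "wmma_sync_16x16x16_f16f16f32",
            "store": "wmma_store_16x16x16_f32_shared",
        },
    ],
    structure="SSSRRSRS",
    tile_binds=["blockIdx.y", "blockIdx.x", "threadIdx.y"],
    max_innermost_factor=4,
    vector_load_lens=[1, 2, 4, 8],
    reuse_read=ReuseType(req="must", levels=[4], scope="shared"),
    reuse_write=ReuseType(req="must", levels=[2], scope="shared"),
    use_software_pipeline=True,  # the flag added in this PR
)
```

As the `default_config.py` hunk below shows, the flag defaults to off, so existing search spaces are unchanged.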
3 changes: 2 additions & 1 deletion include/tvm/meta_schedule/schedule_rule.h
@@ -190,13 +190,14 @@ class ScheduleRule : public runtime::ObjectRef {
* NullOpt means disable vectorization
* \param reuse_read Data reuse configuration for reading. NullOpt means no reuse.
* \param reuse_write Data reuse configuration for writing. NullOpt means no reuse.
* \param use_software_pipeline Whether to use the software pipeline.
* \return The schedule rule created
*/
TVM_DLL static ScheduleRule MultiLevelTilingTensorCore(
Array<Map<String, String>> intrin_groups, String structure,
Optional<Array<String>> tile_binds, Optional<Integer> max_innermost_factor,
Optional<Array<Integer>> vector_load_lens, Optional<Map<String, ObjectRef>> reuse_read,
Optional<Map<String, ObjectRef>> reuse_write);
Optional<Map<String, ObjectRef>> reuse_write, bool use_software_pipeline);

/*!
* \brief Create a rule: add-rfactor to some blocks if needed
1 change: 1 addition & 0 deletions python/tvm/meta_schedule/default_config.py
@@ -381,6 +381,7 @@ def schedule_rules():
levels=[2],
scope="shared",
),
use_software_pipeline=False,
),
*_DefaultCUDA.schedule_rules(),
]
4 changes: 4 additions & 0 deletions python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py
@@ -161,6 +161,8 @@ class MultiLevelTilingTensorCore(ScheduleRule):
Data reuse configuration for reading. None means no reuse.
reuse_write : Optional[ReuseType]
Data reuse configuration for writing. None means no reuse.
use_software_pipeline : bool
Whether to use the software pipeline.
"""

def __init__(
@@ -172,6 +174,7 @@ def __init__(
vector_load_lens: Optional[List[int]] = None,
reuse_read: Optional[ReuseType] = None,
reuse_write: Optional[ReuseType] = None,
use_software_pipeline: bool = False,
) -> None:
self.__init_handle_by_constructor__(
_ffi_api.ScheduleRuleMultiLevelTilingTensorCore, # type: ignore # pylint: disable=no-member
@@ -182,4 +185,5 @@ def __init__(
vector_load_lens,
reuse_read.as_dict() if reuse_read is not None else None,
reuse_write.as_dict() if reuse_write is not None else None,
use_software_pipeline,
)
2 changes: 2 additions & 0 deletions python/tvm/meta_schedule/testing/schedule_rule.py
@@ -119,6 +119,7 @@ def multi_level_tiling_tensor_core(
in_dtype: Union[str, List[str]] = "float16",
out_dtype: Union[str, List[str]] = "float32",
trans_b: Union[bool, List[bool]] = False,
use_software_pipeline: bool = False,
) -> ScheduleRule:
"""Default schedule rules for with multi-level tiling reuse for tensor core"""
assert write_reuse_scope in ["shared", "global"]
@@ -154,6 +155,7 @@
levels=[2],
scope=write_reuse_scope,
),
use_software_pipeline=use_software_pipeline,
)
raise NotImplementedError(f"{target.kind.name} is not supported")

122 changes: 121 additions & 1 deletion src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
@@ -128,6 +128,8 @@ class MultiLevelTilingTensorCoreNode : public MultiLevelTilingNode {
inline std::vector<State> AddReadReuseTensorCore(TensorCoreState state) const;
// Subrule: Add tensorized store
inline std::vector<State> AddWriteReuseTensorCore(TensorCoreState state) const;
// Subrule: Add software pipeline
inline std::vector<State> AddSoftwarePipeline(TensorCoreState state) const;

// Override ApplySubRules to apply tensorization-specific sub-rules
std::vector<State> ApplySubRules(std::vector<State> states) final;
@@ -155,6 +157,8 @@ class MultiLevelTilingTensorCoreNode : public MultiLevelTilingNode {
public:
/*! \brief The candidate tensor core intrin groups to apply */
std::vector<TensorCoreIntrinGroup> intrin_groups;
/*! \brief Whether to use software pipeline */
bool use_software_pipeline = false;
static constexpr const char* _type_key = "meta_schedule.MultiLevelTilingTensorCore";
TVM_DECLARE_FINAL_OBJECT_INFO(MultiLevelTilingTensorCoreNode, MultiLevelTilingNode);

@@ -222,6 +226,9 @@ std::vector<State> MultiLevelTilingTensorCoreNode::ApplySubRules(std::vector<Sta
states = SubRule(std::move(states), [&](State state) {
return AddReadReuseTensorCore(Downcast<TensorCoreState>(state));
});
states = SubRule(std::move(states), [&](State state) {
return AddSoftwarePipeline(Downcast<TensorCoreState>(state));
});
return states;
}

@@ -286,6 +293,117 @@ std::vector<State> MultiLevelTilingTensorCoreNode::AddReadReuseTensorCore(
return {state};
}

std::vector<State> MultiLevelTilingTensorCoreNode::AddSoftwarePipeline(
TensorCoreState state) const {
if (!use_software_pipeline) {
return {state};
}
// Software pipelining requires at least two levels of reduction tiling.
if (r_indices_.size() < 2) {
return {state};
}

Schedule& sch = state->sch;
// Check reduction length after blockize.
int64_t reduction_length = 1;
for (int r_index : r_indices_) {
const Array<LoopRV>& tiles = state->tiles[r_index];
for (const LoopRV& tile : tiles) {
const auto* extent = sch->Get(tile)->extent.as<IntImmNode>();
ICHECK(extent != nullptr) << "Dynamic extent is not supported.";
reduction_length *= extent->value;
}
}
if (reduction_length <= 1) {
return {state};
}

// Add local stage and double buffering
for (int i = 0; i < 2; ++i) {
const tir::BlockRV cache_read = state->read_reuse.at(i);
sch->Annotate(cache_read, tir::attr::manifest_shared_memory_local_stage, Bool(true));
sch->Annotate(cache_read, tir::attr::double_buffer_scope, Integer(0));
}

// Add the software pipeline annotations
//
// Before pipelining, the original loop can be expressed as the pseudo code below:
//
// for k0 in [0, K0):
// load tile k0 to registers
// load tile k0 from registers to shared memory
//
// for k1 in [0, K1):
// load fragment k1 of tile k0
// compute matmul with fragment k1
//

// Inner software pipeline: prefetch the tensor core fragments one iteration ahead.
// The following annotations on the inner loop are equivalent to the pseudo code below:
//
// Pipelined inner loop:
//
// prologue:
// load fragment 0
// body:
// for k1 in [0, K1 - 1):
// load fragment k1 + 1
// compute matmul with fragment k1
// epilogue:
// compute matmul with fragment K1 - 1
//
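// Note: the inner loop body contains three blocks (the wmma.matrix_a load, the
// wmma.matrix_b load, and the tensor core compute), so stage {0, 0, 1} keeps both
// fragment loads one pipeline stage ahead of the compute.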
sch->Annotate(state->tiles[r_indices_[1]].back(), tir::attr::software_pipeline_stage,
Array<Integer>{0, 0, 1});
sch->Annotate(state->tiles[r_indices_[1]].back(), tir::attr::software_pipeline_order,
Array<Integer>{0, 1, 2});
// Outer software pipeline: interleave the outer loop with the (pipelined) inner loop.
// The prologue (prefetching stage) of the inner pipeline is executed one iteration
// ahead in the outer loop.
// The following annotations on the outer loop are equivalent to the pseudo code below:
//
// Pipelined outer loop with nested inner pipeline:
//
// prologue:
// load tile 0 to registers
// load tile 0 from registers to shared memory
//
// // prologue of the inner pipeline
// load fragment 0 of tile 0
//
// body:
// for k0 in [0, K0 - 1):
// load tile k0 + 1 to registers
//
// // body of the inner pipeline
// for k1 in [0, K1 - 1):
// load fragment k1 + 1 of tile k0
// compute matmul with fragment k1 of tile k0
//
// load tile k0 + 1 from registers to shared memory
//
// // prologue of the inner pipeline
// load fragment 0 of tile k0 + 1
//
// // epilogue of the inner pipeline
// compute matmul with fragment K1 - 1 of tile k0
//
// epilogue:
//
// // body of the inner pipeline
// for k1 in [0, K1 - 1):
// load fragment k1 + 1 of tile K0 - 1
// compute matmul with fragment k1 of tile K0 - 1
//
// // epilogue of the inner pipeline
// compute matmul with fragment K1 - 1 of tile K0 - 1
//
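// Note: after the local stage and double buffering are added above, the outer loop
// body contains seven blocks: a register (local stage) copy and a shared memory
// store for each of the two operands (four blocks), plus the prologue, body, and
// epilogue of the nested inner pipeline (three blocks). The stage/order arrays
// below index these seven blocks.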
sch->Annotate(state->tiles[r_indices_[0]].back(), tir::attr::software_pipeline_stage,
Array<Integer>{0, 0, 0, 0, 0, 1, 1});
sch->Annotate(state->tiles[r_indices_[0]].back(), tir::attr::software_pipeline_order,
Array<Integer>{0, 3, 1, 4, 5, 2, 6});

return {state};
}

Optional<LoopRV> MultiLevelTilingTensorCoreNode::TransformWithTensorIntrin(
TensorCoreStateNode* state, const String& intrin_name) const {
BlockRV block_rv = state->block_rv;
@@ -418,14 +536,16 @@ inline std::vector<State> MultiLevelTilingTensorCoreNode::TransformForTensorizat
ScheduleRule ScheduleRule::MultiLevelTilingTensorCore(
Array<Map<String, String>> intrin_groups, String structure, Optional<Array<String>> tile_binds,
Optional<Integer> max_innermost_factor, Optional<Array<Integer>> vector_load_lens,
Optional<Map<String, ObjectRef>> reuse_read, Optional<Map<String, ObjectRef>> reuse_write) {
Optional<Map<String, ObjectRef>> reuse_read, Optional<Map<String, ObjectRef>> reuse_write,
bool use_software_pipeline) {
auto node = MultiLevelTilingInitCommon<MultiLevelTilingTensorCoreNode>(
structure, tile_binds, max_innermost_factor, vector_load_lens, reuse_read, reuse_write);

node->intrin_groups.reserve(intrin_groups.size());
for (const auto& intrin_group_config : intrin_groups) {
node->intrin_groups.emplace_back(TensorCoreIntrinGroup::FromConfig(intrin_group_config));
}
node->use_software_pipeline = use_software_pipeline;
return ScheduleRule(node);
}

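At the TIR level this sub-rule only emits annotations; the actual loop restructuring happens later in the `InjectSoftwarePipeline` pass. A minimal hand-written sketch of the same annotations, with keys and values copied verbatim from the expected trace in the test below (the `sch`, block, and loop handles are hypothetical placeholders for what the tiling sub-rules produce):

```python
# Sketch: the annotations AddSoftwarePipeline emits, applied by hand.
from tvm import tir

def annotate_software_pipeline(
    sch: tir.Schedule,
    a_shared: tir.schedule.BlockRV,  # cooperative-fetch block for operand A
    b_shared: tir.schedule.BlockRV,  # cooperative-fetch block for operand B
    k0: tir.schedule.LoopRV,  # outer reduction tile loop
    k1: tir.schedule.LoopRV,  # inner reduction tile loop
) -> None:
    # Local staging and double buffering for the two shared memory copies.
    for block in (a_shared, b_shared):
        sch.annotate(block, "tir.manifest_shared_memory_local_stage", 1)
        sch.annotate(block, "double_buffer_scope", 0)
    # Inner pipeline: fragment loads run one stage ahead of the matmul.
    sch.annotate(k1, "software_pipeline_stage", [0, 0, 1])
    sch.annotate(k1, "software_pipeline_order", [0, 1, 2])
    # Outer pipeline: overlap shared memory staging with the inner pipeline.
    sch.annotate(k0, "software_pipeline_stage", [0, 0, 0, 0, 0, 1, 1])
    sch.annotate(k0, "software_pipeline_order", [0, 3, 1, 4, 5, 2, 6])
```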
@@ -709,6 +709,131 @@ def test_cuda_tensor_core_matmul_relu():
check_trace(spaces, expected)


def test_cuda_tensor_core_software_pipeline_matmul_relu():
m = n = k = 128
target = Target("cuda", host="llvm")
ctx = _create_context(
create_prim_func(
te_workload.matmul_relu(
n=n,
m=m,
k=k,
in_dtype="float16",
out_dtype="float32",
)
),
target=target,
rule=[
multi_level_tiling_tensor_core(
target=target, write_reuse_scope="shared", use_software_pipeline=True
),
auto_inline(target),
],
)
spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
assert len(spaces) == 1

expected = [
"""b0 = sch.get_block(name="C", func_name="main")
b1 = sch.get_block(name="compute", func_name="main")
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")
b2 = sch.reindex(block=b0, buffer=("write", 0))
b3 = sch.reindex(block=b0, buffer=("read", 0))
b4 = sch.reindex(block=b0, buffer=("read", 1))
sch.transform_layout(block=b0, buffer=("read", 0), index_map=lambda i, k: (i, k, ))
sch.transform_layout(block=b0, buffer=("read", 1), index_map=lambda j, k: (k, j, ))
sch.transform_layout(block=b0, buffer=("write", 0), index_map=lambda i, j: (i, j, ))
sch.transform_block_layout(block=b2, index_map=lambda i, j, k: (i, j, k, ))
sch.transform_block_layout(block=b3, index_map=lambda i, j, k: (i, j, k, ))
sch.transform_block_layout(block=b4, index_map=lambda i, j, k: (i, j, k, ))
sch.transform_block_layout(block=b0, index_map=lambda i, j, k: (i, j, k, ))
l5, l6, l7 = sch.get_loops(block=b0)
l8, l9 = sch.split(loop=l7, factors=[None, 16], preserve_unit_iters=True)
l10, l11 = sch.split(loop=l6, factors=[None, 16], preserve_unit_iters=True)
l12, l13 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True)
l14, l15, l16, l17, l18, l19 = sch.get_loops(block=b0)
sch.reorder(l16, l18, l13, l11, l9)
b20 = sch.blockize(loop=l13)
sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_sync_16x16x16_f16f16f32")
sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize_init", ann_val="wmma_fill_16x16x16_f32")
sch.annotate(block_or_loop=b20, ann_key="warp_execution", ann_val=1)
l21, l22, l23 = sch.get_loops(block=b20)
v24, v25, v26, v27, v28 = sch.sample_perfect_tile(loop=l21, n=5, max_innermost_factor=4)
l29, l30, l31, l32, l33 = sch.split(loop=l21, factors=[v24, v25, v26, v27, v28], preserve_unit_iters=True)
v34, v35, v36, v37, v38 = sch.sample_perfect_tile(loop=l22, n=5, max_innermost_factor=4)
l39, l40, l41, l42, l43 = sch.split(loop=l22, factors=[v34, v35, v36, v37, v38], preserve_unit_iters=True)
v44, v45, v46 = sch.sample_perfect_tile(loop=l23, n=3, max_innermost_factor=4)
l47, l48, l49 = sch.split(loop=l23, factors=[v44, v45, v46], preserve_unit_iters=True)
sch.reorder(l29, l39, l30, l40, l31, l41, l47, l48, l32, l42, l49, l33, l43)
l50 = sch.fuse(l29, l39, preserve_unit_iters=True)
sch.bind(loop=l50, thread_axis="blockIdx.y")
l51 = sch.fuse(l30, l40, preserve_unit_iters=True)
sch.bind(loop=l51, thread_axis="blockIdx.x")
l52 = sch.fuse(l31, l41, preserve_unit_iters=True)
sch.bind(loop=l52, thread_axis="threadIdx.y")
b53 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="shared")
sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True)
b54 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="wmma.accumulator")
sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True)
v55 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])
sch.annotate(block_or_loop=b53, ann_key="meta_schedule.cooperative_fetch", ann_val=v55)
sch.reverse_compute_inline(block=b2)
l56, l57, l58, l59, l60 = sch.get_loops(block=b54)
l61, l62 = sch.split(loop=l60, factors=[None, 16], preserve_unit_iters=True)
l63, l64 = sch.split(loop=l59, factors=[None, 16], preserve_unit_iters=True)
l65, l66, l67, l68, l69, l70, l71 = sch.get_loops(block=b54)
sch.reorder(l70, l64, l62)
b72 = sch.blockize(loop=l64)
sch.annotate(block_or_loop=b72, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared")
b73 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="shared")
sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True)
l74, l75, l76, l77, l78, l79 = sch.get_loops(block=b73)
l80 = sch.fuse(l78, l79, preserve_unit_iters=True)
v81 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v81)
b82 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="shared")
sch.compute_at(block=b82, loop=l47, preserve_unit_loops=True)
l83, l84, l85, l86, l87, l88 = sch.get_loops(block=b82)
l89 = sch.fuse(l87, l88, preserve_unit_iters=True)
v90 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
sch.annotate(block_or_loop=b82, ann_key="meta_schedule.cooperative_fetch", ann_val=v90)
b91 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="wmma.matrix_a")
sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True)
l92, l93, l94, l95, l96, l97, l98 = sch.get_loops(block=b91)
l99, l100 = sch.split(loop=l98, factors=[None, 16], preserve_unit_iters=True)
l101, l102 = sch.split(loop=l97, factors=[None, 16], preserve_unit_iters=True)
l103, l104, l105, l106, l107, l108, l109, l110, l111 = sch.get_loops(block=b91)
sch.reorder(l110, l102, l100)
b112 = sch.blockize(loop=l102)
sch.annotate(block_or_loop=b112, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a")
b113 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="wmma.matrix_b")
sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True)
l114, l115, l116, l117, l118, l119, l120 = sch.get_loops(block=b113)
l121, l122 = sch.split(loop=l120, factors=[None, 16], preserve_unit_iters=True)
l123, l124 = sch.split(loop=l119, factors=[None, 16], preserve_unit_iters=True)
l125, l126, l127, l128, l129, l130, l131, l132, l133 = sch.get_loops(block=b113)
sch.reorder(l132, l124, l122)
b134 = sch.blockize(loop=l124)
sch.annotate(block_or_loop=b134, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_b")
sch.compute_inline(block=b3)
sch.compute_inline(block=b4)
sch.storage_align(block=b73, buffer_index=0, axis=-2, factor=32, offset=8)
sch.storage_align(block=b82, buffer_index=0, axis=-2, factor=32, offset=8)
sch.annotate(block_or_loop=b73, ann_key="tir.manifest_shared_memory_local_stage", ann_val=1)
sch.annotate(block_or_loop=b73, ann_key="double_buffer_scope", ann_val=0)
sch.annotate(block_or_loop=b82, ann_key="tir.manifest_shared_memory_local_stage", ann_val=1)
sch.annotate(block_or_loop=b82, ann_key="double_buffer_scope", ann_val=0)
sch.annotate(block_or_loop=l48, ann_key="software_pipeline_stage", ann_val=[0, 0, 1])
sch.annotate(block_or_loop=l48, ann_key="software_pipeline_order", ann_val=[0, 1, 2])
sch.annotate(block_or_loop=l47, ann_key="software_pipeline_stage", ann_val=[0, 0, 0, 0, 0, 1, 1])
sch.annotate(block_or_loop=l47, ann_key="software_pipeline_order", ann_val=[0, 3, 1, 4, 5, 2, 6])
sch.reverse_compute_inline(block=b1)""".split(
"\n"
)
]
check_trace(spaces, expected)


def test_cuda_tensor_core_matmul_relu_global():
m = n = k = 128
target = Target("cuda", host="llvm")