[MetaSchedule] Add software pipeline in CUDA tensor core auto tensorization #12544

Merged 2 commits on Aug 25, 2022

Changes from all commits
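For context, a minimal sketch of how the new `use_software_pipeline` knob can be turned on from the Python API. Everything except the new flag mirrors the existing CUDA tensor core defaults; the wmma intrin names are taken from the expected trace in the test below, and the intrin-group keys ("init", "load_a", ...) are an assumption, not something this diff prescribes:

```python
# Hedged sketch, not part of this PR: constructing the rule with software
# pipelining enabled. The intrin names follow the wmma intrinsics in the
# expected trace below; the group keys ("init", "load_a", ...) are assumed.
from tvm.meta_schedule.schedule_rule import MultiLevelTilingTensorCore, ReuseType

rule = MultiLevelTilingTensorCore(
    intrin_groups=[
        {
            "init": "wmma_fill_16x16x16_f32",
            "load_a": "wmma_load_16x16x16_f16_a",
            "load_b": "wmma_load_16x16x16_f16_b",
            "compute": "wmma_sync_16x16x16_f16f16f32",
            "store": "wmma_store_16x16x16_f32_shared",
        },
    ],
    structure="SSSRRSRS",
    tile_binds=["blockIdx.y", "blockIdx.x", "threadIdx.y"],
    max_innermost_factor=4,
    vector_load_lens=[1, 2, 4, 8],
    reuse_read=ReuseType(req="must", levels=[4], scope="shared"),
    reuse_write=ReuseType(req="must", levels=[2], scope="shared"),
    use_software_pipeline=True,  # the flag added in this PR
)
```

As the `default_config.py` hunk below shows, the flag defaults to off, so existing search spaces are unchanged.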
3 changes: 2 additions & 1 deletion include/tvm/meta_schedule/schedule_rule.h
@@ -190,13 +190,14 @@ class ScheduleRule : public runtime::ObjectRef {
* NullOpt means disable vectorization
* \param reuse_read Data reuse configuration for reading. NullOpt means no reuse.
* \param reuse_write Data reuse configuration for writing. NullOpt means no reuse.
* \param use_software_pipeline Whether to use the software pipeline.
* \return The schedule rule created
*/
TVM_DLL static ScheduleRule MultiLevelTilingTensorCore(
Array<Map<String, String>> intrin_groups, String structure,
Optional<Array<String>> tile_binds, Optional<Integer> max_innermost_factor,
Optional<Array<Integer>> vector_load_lens, Optional<Map<String, ObjectRef>> reuse_read,
Optional<Map<String, ObjectRef>> reuse_write);
Optional<Map<String, ObjectRef>> reuse_write, bool use_software_pipeline);

/*!
* \brief Create a rule: add-rfactor to some blocks if needed
1 change: 1 addition & 0 deletions python/tvm/meta_schedule/default_config.py
@@ -381,6 +381,7 @@ def schedule_rules():
levels=[2],
scope="shared",
),
use_software_pipeline=False,
),
*_DefaultCUDA.schedule_rules(),
]
4 changes: 4 additions & 0 deletions python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py
@@ -161,6 +161,8 @@ class MultiLevelTilingTensorCore(ScheduleRule):
Data reuse configuration for reading. None means no reuse.
reuse_write : Optional[ReuseType]
Data reuse configuration for writing. None means no reuse.
use_software_pipeline : bool
Whether to use the software pipeline.
"""

def __init__(
@@ -172,6 +174,7 @@ def __init__(
vector_load_lens: Optional[List[int]] = None,
reuse_read: Optional[ReuseType] = None,
reuse_write: Optional[ReuseType] = None,
use_software_pipeline: bool = False,
) -> None:
self.__init_handle_by_constructor__(
_ffi_api.ScheduleRuleMultiLevelTilingTensorCore, # type: ignore # pylint: disable=no-member
@@ -182,4 +185,5 @@ def __init__(
vector_load_lens,
reuse_read.as_dict() if reuse_read is not None else None,
reuse_write.as_dict() if reuse_write is not None else None,
use_software_pipeline,
)
2 changes: 2 additions & 0 deletions python/tvm/meta_schedule/testing/schedule_rule.py
@@ -119,6 +119,7 @@ def multi_level_tiling_tensor_core(
in_dtype: Union[str, List[str]] = "float16",
out_dtype: Union[str, List[str]] = "float32",
trans_b: Union[bool, List[bool]] = False,
use_software_pipeline: bool = False,
) -> ScheduleRule:
"""Default schedule rules for with multi-level tiling reuse for tensor core"""
assert write_reuse_scope in ["shared", "global"]
@@ -154,6 +155,7 @@
levels=[2],
scope=write_reuse_scope,
),
use_software_pipeline=use_software_pipeline,
)
raise NotImplementedError(f"{target.kind.name} is not supported")

122 changes: 121 additions & 1 deletion src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
@@ -128,6 +128,8 @@ class MultiLevelTilingTensorCoreNode : public MultiLevelTilingNode {
inline std::vector<State> AddReadReuseTensorCore(TensorCoreState state) const;
// Subrule: Add tensorized store
inline std::vector<State> AddWriteReuseTensorCore(TensorCoreState state) const;
// Subrule: Add software pipeline
inline std::vector<State> AddSoftwarePipeline(TensorCoreState state) const;

// Override ApplySubRules to apply tensorization-specific sub-rules
std::vector<State> ApplySubRules(std::vector<State> states) final;
@@ -155,6 +157,8 @@ class MultiLevelTilingTensorCoreNode : public MultiLevelTilingNode {
public:
/*! \brief The candidate tensor core intrin groups to apply */
std::vector<TensorCoreIntrinGroup> intrin_groups;
/*! \brief Whether to use software pipeline */
bool use_software_pipeline = false;
static constexpr const char* _type_key = "meta_schedule.MultiLevelTilingTensorCore";
TVM_DECLARE_FINAL_OBJECT_INFO(MultiLevelTilingTensorCoreNode, MultiLevelTilingNode);

@@ -222,6 +226,9 @@ std::vector<State> MultiLevelTilingTensorCoreNode::ApplySubRules(std::vector<Sta
states = SubRule(std::move(states), [&](State state) {
return AddReadReuseTensorCore(Downcast<TensorCoreState>(state));
});
states = SubRule(std::move(states), [&](State state) {
return AddSoftwarePipeline(Downcast<TensorCoreState>(state));
});
return states;
}

@@ -286,6 +293,117 @@ std::vector<State> MultiLevelTilingTensorCoreNode::AddReadReuseTensorCore(
return {state};
}

std::vector<State> MultiLevelTilingTensorCoreNode::AddSoftwarePipeline(
TensorCoreState state) const {
if (!use_software_pipeline) {
return {state};
}
// Software pipelining requires at least two levels of reduction tiling.
if (r_indices_.size() < 2) {
return {state};
}

Schedule& sch = state->sch;
// Check reduction length after blockize.
int64_t reduction_length = 1;
for (int r_index : r_indices_) {
const Array<LoopRV>& tiles = state->tiles[r_index];
for (const LoopRV& tile : tiles) {
const auto* extent = sch->Get(tile)->extent.as<IntImmNode>();
ICHECK(extent != nullptr) << "Dynamic extent is not supported.";
reduction_length *= extent->value;
}
}
if (reduction_length <= 1) {
return {state};
}

// Add local stage and double buffering
for (int i = 0; i < 2; ++i) {
const tir::BlockRV cache_read = state->read_reuse.at(i);
sch->Annotate(cache_read, tir::attr::manifest_shared_memory_local_stage, Bool(true));
sch->Annotate(cache_read, tir::attr::double_buffer_scope, Integer(0));
}

// Add the software pipeline annotations
//
// Before pipelining, the original loop can be expressed as the pseudo code below:
//
// for k0 in [0, K0):
// load tile k0 to registers
// load tile k0 from registers to shared memory
//
// for k1 in [0, K1):
// load fragment k1 of tile k0
// compute matmul with fragment k1
//

// Inner software pipeline: prefetch the tensor core fragments one iteration ahead.
// The following annotations on the inner loop are equivalent to the pseudo code below:
//
// Pipelined inner loop:
//
// prologue:
// load fragment 0
// body:
// for k1 in [0, K1 - 1):
// load fragment k1 + 1
// compute matmul with fragment k1
// epilogue:
// compute matmul with fragment K1 - 1
//
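// Note: the inner loop body contains three blocks (the wmma.matrix_a load, the
// wmma.matrix_b load, and the tensor core compute), so stage {0, 0, 1} keeps both
// fragment loads one pipeline stage ahead of the compute.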
sch->Annotate(state->tiles[r_indices_[1]].back(), tir::attr::software_pipeline_stage,
Array<Integer>{0, 0, 1});
sch->Annotate(state->tiles[r_indices_[1]].back(), tir::attr::software_pipeline_order,
Array<Integer>{0, 1, 2});
// Outer software pipeline: interleave the outer loop with the (pipelined) inner loop.
// The prologue (prefetching stage) of the inner pipeline is executed one iteration
// ahead in the outer loop.
// The following annotations on the outer loop are equivalent to the pseudo code below:
//
// Pipelined outer loop with nested inner pipeline:
//
// prologue:
// load tile 0 to registers
// load tile 0 from registers to shared memory
//
// // prologue of the inner pipeline
// load fragment 0 of tile 0
//
// body:
// for k0 in [0, K0 - 1):
// load tile k0 + 1 to registers
//
// // body of the inner pipeline
// for k1 in [0, K1 - 1):
// load fragment k1 + 1 of tile k0
// compute matmul with fragment k1 of tile k0
//
// load tile k0 + 1 from registers to shared memory
//
// // prologue of the inner pipeline
// load fragment 0 of tile k0 + 1
//
// // epilogue of the inner pipeline
// compute matmul with fragment K1 - 1 of tile k0
//
// epilogue:
//
// // body of the inner pipeline
// for k1 in [0, K1 - 1):
// load fragment k1 + 1 of tile K0 - 1
// compute matmul with fragment k1 of tile K0 - 1
//
// // epilogue of the inner pipeline
// compute matmul with fragment K1 - 1 of tile K0 - 1
//
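// Note: after the local stage and double buffering are added above, the outer loop
// body contains seven blocks: a register (local stage) copy and a shared memory
// store for each of the two operands (four blocks), plus the prologue, body, and
// epilogue of the nested inner pipeline (three blocks). The stage/order arrays
// below index these seven blocks.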
sch->Annotate(state->tiles[r_indices_[0]].back(), tir::attr::software_pipeline_stage,
Array<Integer>{0, 0, 0, 0, 0, 1, 1});
sch->Annotate(state->tiles[r_indices_[0]].back(), tir::attr::software_pipeline_order,
Array<Integer>{0, 3, 1, 4, 5, 2, 6});

return {state};
}

Optional<LoopRV> MultiLevelTilingTensorCoreNode::TransformWithTensorIntrin(
TensorCoreStateNode* state, const String& intrin_name) const {
BlockRV block_rv = state->block_rv;
@@ -418,14 +536,16 @@ inline std::vector<State> MultiLevelTilingTensorCoreNode::TransformForTensorizat
ScheduleRule ScheduleRule::MultiLevelTilingTensorCore(
Array<Map<String, String>> intrin_groups, String structure, Optional<Array<String>> tile_binds,
Optional<Integer> max_innermost_factor, Optional<Array<Integer>> vector_load_lens,
Optional<Map<String, ObjectRef>> reuse_read, Optional<Map<String, ObjectRef>> reuse_write) {
Optional<Map<String, ObjectRef>> reuse_read, Optional<Map<String, ObjectRef>> reuse_write,
bool use_software_pipeline) {
auto node = MultiLevelTilingInitCommon<MultiLevelTilingTensorCoreNode>(
structure, tile_binds, max_innermost_factor, vector_load_lens, reuse_read, reuse_write);

node->intrin_groups.reserve(intrin_groups.size());
for (const auto& intrin_group_config : intrin_groups) {
node->intrin_groups.emplace_back(TensorCoreIntrinGroup::FromConfig(intrin_group_config));
}
node->use_software_pipeline = use_software_pipeline;
return ScheduleRule(node);
}

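At the TIR level this sub-rule only emits annotations; the actual loop restructuring happens later in the `InjectSoftwarePipeline` pass. A minimal hand-written sketch of the same annotations, with keys and values copied verbatim from the expected trace in the test below (the `sch`, block, and loop handles are hypothetical placeholders for what the tiling sub-rules produce):

```python
# Sketch: the annotations AddSoftwarePipeline emits, applied by hand.
from tvm import tir

def annotate_software_pipeline(
    sch: tir.Schedule,
    a_shared: tir.schedule.BlockRV,  # cooperative-fetch block for operand A
    b_shared: tir.schedule.BlockRV,  # cooperative-fetch block for operand B
    k0: tir.schedule.LoopRV,  # outer reduction tile loop
    k1: tir.schedule.LoopRV,  # inner reduction tile loop
) -> None:
    # Local staging and double buffering for the two shared memory copies.
    for block in (a_shared, b_shared):
        sch.annotate(block, "tir.manifest_shared_memory_local_stage", 1)
        sch.annotate(block, "double_buffer_scope", 0)
    # Inner pipeline: fragment loads run one stage ahead of the matmul.
    sch.annotate(k1, "software_pipeline_stage", [0, 0, 1])
    sch.annotate(k1, "software_pipeline_order", [0, 1, 2])
    # Outer pipeline: overlap shared memory staging with the inner pipeline.
    sch.annotate(k0, "software_pipeline_stage", [0, 0, 0, 0, 0, 1, 1])
    sch.annotate(k0, "software_pipeline_order", [0, 3, 1, 4, 5, 2, 6])
```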
@@ -709,6 +709,131 @@ def test_cuda_tensor_core_matmul_relu():
check_trace(spaces, expected)


def test_cuda_tensor_core_software_pipeline_matmul_relu():
m = n = k = 128
target = Target("cuda", host="llvm")
ctx = _create_context(
create_prim_func(
te_workload.matmul_relu(
n=n,
m=m,
k=k,
in_dtype="float16",
out_dtype="float32",
)
),
target=target,
rule=[
multi_level_tiling_tensor_core(
target=target, write_reuse_scope="shared", use_software_pipeline=True
),
auto_inline(target),
],
)
spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
assert len(spaces) == 1

expected = [
"""b0 = sch.get_block(name="C", func_name="main")
b1 = sch.get_block(name="compute", func_name="main")
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")
b2 = sch.reindex(block=b0, buffer=("write", 0))
b3 = sch.reindex(block=b0, buffer=("read", 0))
b4 = sch.reindex(block=b0, buffer=("read", 1))
sch.transform_layout(block=b0, buffer=("read", 0), index_map=lambda i, k: (i, k, ))
sch.transform_layout(block=b0, buffer=("read", 1), index_map=lambda j, k: (k, j, ))
sch.transform_layout(block=b0, buffer=("write", 0), index_map=lambda i, j: (i, j, ))
sch.transform_block_layout(block=b2, index_map=lambda i, j, k: (i, j, k, ))
sch.transform_block_layout(block=b3, index_map=lambda i, j, k: (i, j, k, ))
sch.transform_block_layout(block=b4, index_map=lambda i, j, k: (i, j, k, ))
sch.transform_block_layout(block=b0, index_map=lambda i, j, k: (i, j, k, ))
l5, l6, l7 = sch.get_loops(block=b0)
l8, l9 = sch.split(loop=l7, factors=[None, 16], preserve_unit_iters=True)
l10, l11 = sch.split(loop=l6, factors=[None, 16], preserve_unit_iters=True)
l12, l13 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True)
l14, l15, l16, l17, l18, l19 = sch.get_loops(block=b0)
sch.reorder(l16, l18, l13, l11, l9)
b20 = sch.blockize(loop=l13)
sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_sync_16x16x16_f16f16f32")
sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize_init", ann_val="wmma_fill_16x16x16_f32")
sch.annotate(block_or_loop=b20, ann_key="warp_execution", ann_val=1)
l21, l22, l23 = sch.get_loops(block=b20)
v24, v25, v26, v27, v28 = sch.sample_perfect_tile(loop=l21, n=5, max_innermost_factor=4)
l29, l30, l31, l32, l33 = sch.split(loop=l21, factors=[v24, v25, v26, v27, v28], preserve_unit_iters=True)
v34, v35, v36, v37, v38 = sch.sample_perfect_tile(loop=l22, n=5, max_innermost_factor=4)
l39, l40, l41, l42, l43 = sch.split(loop=l22, factors=[v34, v35, v36, v37, v38], preserve_unit_iters=True)
v44, v45, v46 = sch.sample_perfect_tile(loop=l23, n=3, max_innermost_factor=4)
l47, l48, l49 = sch.split(loop=l23, factors=[v44, v45, v46], preserve_unit_iters=True)
sch.reorder(l29, l39, l30, l40, l31, l41, l47, l48, l32, l42, l49, l33, l43)
l50 = sch.fuse(l29, l39, preserve_unit_iters=True)
sch.bind(loop=l50, thread_axis="blockIdx.y")
l51 = sch.fuse(l30, l40, preserve_unit_iters=True)
sch.bind(loop=l51, thread_axis="blockIdx.x")
l52 = sch.fuse(l31, l41, preserve_unit_iters=True)
sch.bind(loop=l52, thread_axis="threadIdx.y")
b53 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="shared")
sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True)
b54 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="wmma.accumulator")
sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True)
v55 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])
sch.annotate(block_or_loop=b53, ann_key="meta_schedule.cooperative_fetch", ann_val=v55)
sch.reverse_compute_inline(block=b2)
l56, l57, l58, l59, l60 = sch.get_loops(block=b54)
l61, l62 = sch.split(loop=l60, factors=[None, 16], preserve_unit_iters=True)
l63, l64 = sch.split(loop=l59, factors=[None, 16], preserve_unit_iters=True)
l65, l66, l67, l68, l69, l70, l71 = sch.get_loops(block=b54)
sch.reorder(l70, l64, l62)
b72 = sch.blockize(loop=l64)
sch.annotate(block_or_loop=b72, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared")
b73 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="shared")
sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True)
l74, l75, l76, l77, l78, l79 = sch.get_loops(block=b73)
l80 = sch.fuse(l78, l79, preserve_unit_iters=True)
v81 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v81)
b82 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="shared")
sch.compute_at(block=b82, loop=l47, preserve_unit_loops=True)
l83, l84, l85, l86, l87, l88 = sch.get_loops(block=b82)
l89 = sch.fuse(l87, l88, preserve_unit_iters=True)
v90 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25])
sch.annotate(block_or_loop=b82, ann_key="meta_schedule.cooperative_fetch", ann_val=v90)
b91 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="wmma.matrix_a")
sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True)
l92, l93, l94, l95, l96, l97, l98 = sch.get_loops(block=b91)
l99, l100 = sch.split(loop=l98, factors=[None, 16], preserve_unit_iters=True)
l101, l102 = sch.split(loop=l97, factors=[None, 16], preserve_unit_iters=True)
l103, l104, l105, l106, l107, l108, l109, l110, l111 = sch.get_loops(block=b91)
sch.reorder(l110, l102, l100)
b112 = sch.blockize(loop=l102)
sch.annotate(block_or_loop=b112, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a")
b113 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="wmma.matrix_b")
sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True)
l114, l115, l116, l117, l118, l119, l120 = sch.get_loops(block=b113)
l121, l122 = sch.split(loop=l120, factors=[None, 16], preserve_unit_iters=True)
l123, l124 = sch.split(loop=l119, factors=[None, 16], preserve_unit_iters=True)
l125, l126, l127, l128, l129, l130, l131, l132, l133 = sch.get_loops(block=b113)
sch.reorder(l132, l124, l122)
b134 = sch.blockize(loop=l124)
sch.annotate(block_or_loop=b134, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_b")
sch.compute_inline(block=b3)
sch.compute_inline(block=b4)
sch.storage_align(block=b73, buffer_index=0, axis=-2, factor=32, offset=8)
sch.storage_align(block=b82, buffer_index=0, axis=-2, factor=32, offset=8)
sch.annotate(block_or_loop=b73, ann_key="tir.manifest_shared_memory_local_stage", ann_val=1)
sch.annotate(block_or_loop=b73, ann_key="double_buffer_scope", ann_val=0)
sch.annotate(block_or_loop=b82, ann_key="tir.manifest_shared_memory_local_stage", ann_val=1)
sch.annotate(block_or_loop=b82, ann_key="double_buffer_scope", ann_val=0)
sch.annotate(block_or_loop=l48, ann_key="software_pipeline_stage", ann_val=[0, 0, 1])
sch.annotate(block_or_loop=l48, ann_key="software_pipeline_order", ann_val=[0, 1, 2])
sch.annotate(block_or_loop=l47, ann_key="software_pipeline_stage", ann_val=[0, 0, 0, 0, 0, 1, 1])
sch.annotate(block_or_loop=l47, ann_key="software_pipeline_order", ann_val=[0, 3, 1, 4, 5, 2, 6])
sch.reverse_compute_inline(block=b1)""".split(
"\n"
)
]
check_trace(spaces, expected)


def test_cuda_tensor_core_matmul_relu_global():
m = n = k = 128
target = Target("cuda", host="llvm")