Skip to content

Commit

Permalink
[MetaSchedule] Introduce Async Pipeline in MultiLevelTiling
Browse files Browse the repository at this point in the history
This PR introduces an async pipeline into TVM's current MultiLevelTiling rules. This PR is blocked on #13966, since some conv2d workloads use `tir.if_then_else` to pad the input to the correct size, and this PR uses async copy in such copy statements.

1. Add a subrule in `src/meta_schedule/schedule_rule/multi_level_tiling.h/.cc` that annotates async copy for MultiLevelTiling on supported architectures (>= sm80).

On CUDA cores, this PR gives a performance boost of around 1 TFLOP/s in most Conv2d test cases and 1 to 2 TFLOP/s in most GEMM test cases.

All generated codes, scripts, and traces are available at https://github.com/Rainy-Memory/tvm-async-rule-benchmark.

Currently tested on commit `afbfb7aa7e43732cb716f8e443df696110be6afc` with the conv2d NHWC workload, on an RTX 3080 GPU.

**Notice: given the stochastic nature of evolutionary search, performance might become worse when this PR is enabled.**

Workload: Conv2d NHWC

|Shape|Mainline TVM|Mainline TVM with Async|Performance Boost|
|-|-|-|-|
|N=1_H=224_W=224_C=3_K=64_R=7_S=7_STR=2_PAD=3_DIL=1|13838.05219|14687.89452|6.141343581679319%|
|N=1_H=56_W=56_C=64_K=64_R=1_S=1_STR=1_PAD=0_DIL=1|5398.305085|5613.892553|3.9936140067192905%|
|N=1_H=56_W=56_C=64_K=64_R=3_S=3_STR=1_PAD=1_DIL=1|11652.96825|13157.88249|12.91442839038028%|
|N=1_H=56_W=56_C=64_K=256_R=1_S=1_STR=1_PAD=0_DIL=1|10638.8309|11674.68499|9.736540600527816%|
|N=1_H=56_W=56_C=256_K=64_R=1_S=1_STR=1_PAD=0_DIL=1|8692.32829|9469.264089|8.938178277203573%|
|N=1_H=56_W=56_C=256_K=128_R=1_S=1_STR=2_PAD=0_DIL=1|4685.767442|5698.19634|21.606469175684712%|
|N=1_H=28_W=28_C=128_K=128_R=3_S=3_STR=1_PAD=1_DIL=1|9872.787087|10404.60405|5.38669535070061%|
|N=1_H=28_W=28_C=128_K=512_R=1_S=1_STR=1_PAD=0_DIL=1|9974.281496|10073.31657|0.9929043414276753%|
|N=1_H=28_W=28_C=512_K=128_R=1_S=1_STR=1_PAD=0_DIL=1|7075.866932|8564.572712|21.039199780135142%|
|N=1_H=28_W=28_C=512_K=256_R=1_S=1_STR=2_PAD=0_DIL=1|3648.330914|4021.923142|10.240086132713124%|
|N=1_H=14_W=14_C=256_K=256_R=3_S=3_STR=1_PAD=1_DIL=1|8192.954618|9160.182054|11.805599824451525%|
|N=1_H=14_W=14_C=256_K=1024_R=1_S=1_STR=1_PAD=0_DIL=1|8008.870153|9362.825279|16.90569456283206%|
|N=1_H=14_W=14_C=1024_K=256_R=1_S=1_STR=1_PAD=0_DIL=1|5210.062241|6051.208379|16.144646629759908%|
|N=1_H=14_W=14_C=1024_K=512_R=1_S=1_STR=2_PAD=0_DIL=1|2550.787202|3587.902938|40.65865373586739%|
|N=1_H=7_W=7_C=512_K=512_R=3_S=3_STR=1_PAD=1_DIL=1|4350.626084|5432.788068|24.873706981617943%|
|N=1_H=7_W=7_C=512_K=2048_R=1_S=1_STR=1_PAD=0_DIL=1|6672.068026|7663.725217|14.862815953549454%|
|N=1_H=7_W=7_C=2048_K=512_R=1_S=1_STR=1_PAD=0_DIL=1|3142.564263|4297.988014|36.766909259541826%|

Workload: GEMM NN

|Shape|Mainline TVM|Mainline TVM with Async|Performance Boost|
|-|-|-|-|
|M=512_N=256_K=640|8678.46|10607.37|22.226408832903555%|
|M=512_N=384_K=256|8109.13|10290.72|26.902886006267003%|
|M=512_N=512_K=512|11419.83|14000.86|22.601299669084398%|
|M=512_N=3072_K=768|19709.39|18351.61|-6.8890006235606425%|
|M=512_N=768_K=3072|12844.59|13730.88|6.90010346768561%|
|M=896_N=896_K=896|16149.91|16131.39|-0.11467556165947945%|
|M=1024_N=1024_K=1024|18842.11|19662.8|4.355616223448428%|
|M=1152_N=1152_K=1152|15386.79|16736.1|8.769275462913303%|
|M=1536_N=1536_K=1536|18522.67|18872.06|1.88628313304725%|
|M=2048_N=2048_K=2048|19515.42|18874.85|-3.282378754851291%|
|M=3072_N=3072_K=3072|19233.9|19291.42|0.2990553137948975%|
|M=4096_N=4096_K=4096|17122.17|19259.01|12.479960191961652%|
  • Loading branch information
cblmemo committed Feb 25, 2023
1 parent d7253fb commit 09b18ac
Show file tree
Hide file tree
Showing 6 changed files with 405 additions and 5 deletions.
56 changes: 56 additions & 0 deletions src/meta_schedule/schedule_rule/multi_level_tiling.cc
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,23 @@ void MultiLevelTilingNode::InitializeWithTuneContext(const TuneContext& context)
TVM_PY_LOG(INFO, context->logger) << "'thread_warp_size' is not defined in the target";
}
}
if (Optional<String> opt_sm = context->target.value()->GetAttr<String>("arch")) {
std::string sm = opt_sm.value();
if (support::StartsWith(sm, "sm_")) {
sm = sm.substr(3);
try {
// only sm_80 or higher supports async memcopy
if (std::stoi(sm) >= 80) {
// only stage = 4 & 5 is tested. all integer that is bigger than 2
// is theoretically feasible, but no guarantee for great performance.
this->stages.insert(this->stages.end(), {4, 5});
}
} catch (const std::invalid_argument& e) {
LOG(WARNING) << "ValueError: Unable to parse `target.arch`: " << sm
<< ". Details: " << e.what();
}
}
}
logger = context->logger;
}

Expand Down Expand Up @@ -115,6 +132,8 @@ std::vector<State> MultiLevelTilingNode::ApplySubRules(std::vector<State> states
states = SubRule(std::move(states), [&](State state) { return TileLoopNest(std::move(state)); });
states = SubRule(std::move(states), [&](State state) { return AddWriteReuse(std::move(state)); });
states = SubRule(std::move(states), [&](State state) { return AddReadReuse(std::move(state)); });
states =
SubRule(std::move(states), [&](State state) { return AddAsyncPipeline(std::move(state)); });
return states;
}

Expand Down Expand Up @@ -280,6 +299,43 @@ std::vector<State> MultiLevelTilingNode::AddReadReuse(State state) const {
return results;
}

/*!
 * \brief SubRule 4. Add async software-pipeline annotations to the reduction loop.
 *
 * The input state is always kept as the first candidate. For every entry in
 * `this->stages`, one additional candidate is produced in which the tiles of the
 * first reduction axis are fused and the fused loop is annotated with
 * `software_pipeline_stage`, `software_pipeline_order` and
 * `software_pipeline_async_stages`.
 *
 * The state is returned unchanged (as the only candidate) when there is no
 * reduction axis, when no pipeline stage is available, or when the body of the
 * innermost reduction tile is not a sequence of exactly three `For` loops.
 *
 * \param state The state to extend.
 * \return The original state, followed by one annotated copy per pipeline stage.
 */
std::vector<State> MultiLevelTilingNode::AddAsyncPipeline(State state) const {
  // For arch that does not support async pipeline, this->stages will be an empty vector
  if (r_indices_.empty() || this->stages.empty()) {
    return {state};
  }
  // Currently only the default config used by ScheduleRule::DefaultCUDA is supported
  // @see src/meta_schedule/schedule_rule/schedule_rule.cc
  // The reduction loop must contain exactly 3 for loops, so that it matches the
  // size of the annotation arrays used below.
  tir::StmtSRef r_loop_sref = state->sch->GetSRef(state->tiles[r_indices_[0]].back());
  const tir::ForNode* r_for_loop = TVM_SREF_TO_FOR(r_loop_sref);
  // Probe with `as<>` instead of `Downcast`: a loop body that is not a SeqStmt
  // (e.g. a single nested statement) is simply an unsupported shape and must
  // leave the state untouched rather than abort on a failed cast.
  const tir::SeqStmtNode* seq_node = r_for_loop->body.as<tir::SeqStmtNode>();
  if (seq_node == nullptr || seq_node->seq.size() != 3) {
    return {state};
  }
  for (const tir::Stmt& stmt : seq_node->seq) {
    if (!stmt.as<tir::ForNode>()) {
      return {state};
    }
  }

  std::vector<State> ret;
  ret.reserve(this->stages.size() + 1);
  // Keep the non-pipelined variant in the design space.
  ret.push_back(state);
  for (int stage : this->stages) {
    State new_state = state->Copy();
    LoopRV r_loop_fused = new_state->sch->Fuse(new_state->tiles[r_indices_[0]]);
    // One annotation entry per statement of the 3-statement loop body checked above.
    new_state->sch->Annotate(r_loop_fused, tir::attr::software_pipeline_stage,
                             Array<Integer>{0, 0, stage - 2});
    new_state->sch->Annotate(r_loop_fused, tir::attr::software_pipeline_order,
                             Array<Integer>{0, 1, 2});
    new_state->sch->Annotate(r_loop_fused, tir::attr::software_pipeline_async_stages,
                             Array<Integer>{0});
    ret.push_back(std::move(new_state));
  }
  return ret;
}

void MultiLevelTilingNode::AnnotateCooperativeFetching(Schedule* sch,
const tir::BlockRV& block) const {
// Filter out invalid vector lanes according to the data type.
Expand Down
4 changes: 4 additions & 0 deletions src/meta_schedule/schedule_rule/multi_level_tiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ class MultiLevelTilingNode : public ScheduleRuleNode {
std::vector<State> TileLoopNest(State state) const;
// SubRule 3. add read cache
std::vector<State> AddReadReuse(State state) const;
// SubRule 4. add async pipeline
std::vector<State> AddAsyncPipeline(State state) const;

// Do nothing; Inherited from ScheduleRuleNode
void InitializeWithTuneContext(const TuneContext& context) final;
Expand Down Expand Up @@ -192,6 +194,8 @@ class MultiLevelTilingNode : public ScheduleRuleNode {
int thread_warp_size_;
/*! \brief The maximum number of threads to be used size of a thread warp */
int max_threads_per_block_;
/*! \brief All available async pipeline stages. */
std::vector<int> stages;
/*! \brief The logging function */
PackedFunc logger;
/*! \brief The function to overwrite the default condition for applying MultiLevelTiling. */
Expand Down
6 changes: 3 additions & 3 deletions tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ def cuda_matmul_0(
actual = generate_design_space(
kind="cuda",
mod=mod,
target=Target("nvidia/geforce-rtx-3080"),
target=Target("nvidia/geforce-rtx-2080"), # disable async trace using sm75
types=ms.schedule_rule.MultiLevelTiling,
)
check_sketches(
Expand Down Expand Up @@ -483,7 +483,7 @@ def cuda_matmul_relu_0(
actual = generate_design_space(
kind="cuda",
mod=mod,
target=Target("nvidia/geforce-rtx-3080"),
target=Target("nvidia/geforce-rtx-2080"), # disable async trace using sm75
types=ms.schedule_rule.MultiLevelTiling,
)
check_sketches(
Expand Down Expand Up @@ -723,7 +723,7 @@ def cache_read_specify_consumer_0(
space = generate_design_space(
kind="cuda",
mod=mod,
target=Target("nvidia/geforce-rtx-3080"),
target=Target("nvidia/geforce-rtx-2080"), # disable async trace using sm75
types=ms.schedule_rule.MultiLevelTiling,
)
check_sketches(
Expand Down
2 changes: 1 addition & 1 deletion tests/python/unittest/test_meta_schedule_space_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@


def _target():
return Target("nvidia/geforce-rtx-3070")
return Target("nvidia/geforce-rtx-2080") # disable async trace using sm75


def _design_space(mod):
Expand Down
Loading

0 comments on commit 09b18ac

Please sign in to comment.