[MetaSchedule] Introduce Async Pipeline in MultiLevelTiling (apache#14009)

This PR introduces an async pipeline into TVM's current MultiLevelTiling rules. It is based on apache#13966, which is already merged, because some conv2d workloads use `tir.if_then_else` to pad the input to the correct size, and this PR applies async copy to exactly such copy statements.

1. Add a subrule in `src/meta_schedule/schedule_rule/multi_level_tiling.h/.cc` that annotates async copy for MultiLevelTiling on supported architectures (>= sm_80); see the sketch below.
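As a rough illustration (not part of the PR), the annotations the new subrule attaches correspond to the Python-level schedule calls below. The `matmul` workload, the loop being annotated, and `stage = 4` are assumptions made for this sketch; in the actual rule the annotations go on a fused reduction loop whose body holds exactly three statements (two shared-memory copies and the compute), and `stage` is sampled from {4, 5}.

```python
import tvm
from tvm.script import tir as T


@T.prim_func
def matmul(
    A: T.Buffer((128, 128), "float32"),
    B: T.Buffer((128, 128), "float32"),
    C: T.Buffer((128, 128), "float32"),
) -> None:
    # A plain GEMM; in the real rule, the annotated loop is produced by
    # MultiLevelTiling's tiling + cache-read subrules, not written by hand.
    for i, j, k in T.grid(128, 128, 128):
        with T.block("C"):
            vi, vj, vk = T.axis.remap("SSR", [i, j, k])
            with T.init():
                C[vi, vj] = T.float32(0)
            C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]


sch = tvm.tir.Schedule(matmul)
k = sch.get_loops(sch.get_block("C"))[-1]  # stand-in for the fused reduction loop
stage = 4  # the subrule tries stage = 4 and stage = 5 on sm_80 or higher

# The three annotations AddAsyncPipeline attaches (values mirror the C++ code below):
sch.annotate(k, "software_pipeline_stage", [0, 0, stage - 2])
sch.annotate(k, "software_pipeline_order", [0, 1, 2])
sch.annotate(k, "software_pipeline_async_stages", [0])

print(sch.mod.script())
```

The `[0, 0, stage - 2]` stage assignment keeps the two copy statements in pipeline stage 0, which `software_pipeline_async_stages = [0]` marks as asynchronous, while the compute statement runs `stage - 2` iterations behind them.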

On CUDA cores, this PR gives a performance boost of roughly 1 TFLOP/s in most Conv2d test cases and 1 to 2 TFLOP/s in most GEMM test cases.

All generated code, scripts, and traces are available at https://github.com/Rainy-Memory/tvm-async-rule-benchmark.

Currently tested at commit `afbfb7aa7e43732cb716f8e443df696110be6afc` on the conv2d NHWC workload, with an RTX 3080 GPU.
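For reference, a quick way (not part of the PR) to see which `arch` string the rule will parse for a given target tag; the new pipeline stages are only inserted when it is sm_80 or higher:

```python
from tvm.target import Target

# RTX 3080 resolves to sm_86 (async stages enabled); RTX 2080 resolves to sm_75
# (disabled), which is why the updated unit tests below pin an sm_75 target.
for tag in ["nvidia/geforce-rtx-3080", "nvidia/geforce-rtx-2080"]:
    print(tag, "->", Target(tag).arch)
```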

**Notice: given the stochastic nature of evolutionary search, performance might become worse when this PR is enabled.**

Workload: Conv2d NHWC

|Shape|Mainline TVM (GFLOP/s)|Mainline TVM with Async (GFLOP/s)|Performance Boost|
|-|-|-|-|
|N=1_H=224_W=224_C=3_K=64_R=7_S=7_STR=2_PAD=3_DIL=1|13838.05219|14687.89452|6.141343581679319%|
|N=1_H=56_W=56_C=64_K=64_R=1_S=1_STR=1_PAD=0_DIL=1|5398.305085|5613.892553|3.9936140067192905%|
|N=1_H=56_W=56_C=64_K=64_R=3_S=3_STR=1_PAD=1_DIL=1|11652.96825|13157.88249|12.91442839038028%|
|N=1_H=56_W=56_C=64_K=256_R=1_S=1_STR=1_PAD=0_DIL=1|10638.8309|11674.68499|9.736540600527816%|
|N=1_H=56_W=56_C=256_K=64_R=1_S=1_STR=1_PAD=0_DIL=1|8692.32829|9469.264089|8.938178277203573%|
|N=1_H=56_W=56_C=256_K=128_R=1_S=1_STR=2_PAD=0_DIL=1|4685.767442|5698.19634|21.606469175684712%|
|N=1_H=28_W=28_C=128_K=128_R=3_S=3_STR=1_PAD=1_DIL=1|9872.787087|10404.60405|5.38669535070061%|
|N=1_H=28_W=28_C=128_K=512_R=1_S=1_STR=1_PAD=0_DIL=1|9974.281496|10073.31657|0.9929043414276753%|
|N=1_H=28_W=28_C=512_K=128_R=1_S=1_STR=1_PAD=0_DIL=1|7075.866932|8564.572712|21.039199780135142%|
|N=1_H=28_W=28_C=512_K=256_R=1_S=1_STR=2_PAD=0_DIL=1|3648.330914|4021.923142|10.240086132713124%|
|N=1_H=14_W=14_C=256_K=256_R=3_S=3_STR=1_PAD=1_DIL=1|8192.954618|9160.182054|11.805599824451525%|
|N=1_H=14_W=14_C=256_K=1024_R=1_S=1_STR=1_PAD=0_DIL=1|8008.870153|9362.825279|16.90569456283206%|
|N=1_H=14_W=14_C=1024_K=256_R=1_S=1_STR=1_PAD=0_DIL=1|5210.062241|6051.208379|16.144646629759908%|
|N=1_H=14_W=14_C=1024_K=512_R=1_S=1_STR=2_PAD=0_DIL=1|2550.787202|3587.902938|40.65865373586739%|
|N=1_H=7_W=7_C=512_K=512_R=3_S=3_STR=1_PAD=1_DIL=1|4350.626084|5432.788068|24.873706981617943%|
|N=1_H=7_W=7_C=512_K=2048_R=1_S=1_STR=1_PAD=0_DIL=1|6672.068026|7663.725217|14.862815953549454%|
|N=1_H=7_W=7_C=2048_K=512_R=1_S=1_STR=1_PAD=0_DIL=1|3142.564263|4297.988014|36.766909259541826%|

Workload: GEMM NN

|Shape|Mainline TVM (GFLOP/s)|Mainline TVM with Async (GFLOP/s)|Performance Boost|
|-|-|-|-|
|M=512_N=256_K=640|8678.46|10607.37|22.226408832903555%|
|M=512_N=384_K=256|8109.13|10290.72|26.902886006267003%|
|M=512_N=512_K=512|11419.83|14000.86|22.601299669084398%|
|M=512_N=3072_K=768|19709.39|18351.61|-6.8890006235606425%|
|M=512_N=768_K=3072|12844.59|13730.88|6.90010346768561%|
|M=896_N=896_K=896|16149.91|16131.39|-0.11467556165947945%|
|M=1024_N=1024_K=1024|18842.11|19662.8|4.355616223448428%|
|M=1152_N=1152_K=1152|15386.79|16736.1|8.769275462913303%|
|M=1536_N=1536_K=1536|18522.67|18872.06|1.88628313304725%|
|M=2048_N=2048_K=2048|19515.42|18874.85|-3.282378754851291%|
|M=3072_N=3072_K=3072|19233.9|19291.42|0.2990553137948975%|
|M=4096_N=4096_K=4096|17122.17|19259.01|12.479960191961652%|
cblmemo authored and yongwww committed Feb 27, 2023
1 parent 083d4dd commit 17b0b50
Showing 6 changed files with 405 additions and 5 deletions.
56 changes: 56 additions & 0 deletions src/meta_schedule/schedule_rule/multi_level_tiling.cc
@@ -87,6 +87,23 @@ void MultiLevelTilingNode::InitializeWithTuneContext(const TuneContext& context)
TVM_PY_LOG(INFO, context->logger) << "'thread_warp_size' is not defined in the target";
}
}
if (Optional<String> opt_sm = context->target.value()->GetAttr<String>("arch")) {
std::string sm = opt_sm.value();
if (support::StartsWith(sm, "sm_")) {
sm = sm.substr(3);
try {
// only sm_80 or higher supports async memcopy
if (std::stoi(sm) >= 80) {
// Only stage = 4 and 5 are tested. Any integer greater than 2 is
// theoretically feasible, but there is no guarantee of good performance.
this->stages.insert(this->stages.end(), {4, 5});
}
} catch (const std::invalid_argument& e) {
LOG(WARNING) << "ValueError: Unable to parse `target.arch`: " << sm
<< ". Details: " << e.what();
}
}
}
logger = context->logger;
}

@@ -115,6 +132,8 @@ std::vector<State> MultiLevelTilingNode::ApplySubRules(std::vector<State> states
states = SubRule(std::move(states), [&](State state) { return TileLoopNest(std::move(state)); });
states = SubRule(std::move(states), [&](State state) { return AddWriteReuse(std::move(state)); });
states = SubRule(std::move(states), [&](State state) { return AddReadReuse(std::move(state)); });
states =
SubRule(std::move(states), [&](State state) { return AddAsyncPipeline(std::move(state)); });
return states;
}

@@ -280,6 +299,43 @@ std::vector<State> MultiLevelTilingNode::AddReadReuse(State state) const {
return results;
}

std::vector<State> MultiLevelTilingNode::AddAsyncPipeline(State state) const {
// For archs that do not support async pipelines, this->stages will be an empty vector
if (r_indices_.size() < 1 || this->stages.empty()) {
return {state};
}
// Currently we only support the default config used by ScheduleRule::DefaultCUDA
// @see src/meta_schedule/schedule_rule/schedule_rule.cc
// Check that the reduction loop body contains exactly 3 for loops,
// so that it matches the annotation array size in the following code
tir::StmtSRef r_loop_sref = state->sch->GetSRef(state->tiles[r_indices_[0]].back());
const tir::ForNode* r_for_loop = TVM_SREF_TO_FOR(r_loop_sref);
Array<tir::Stmt> seq = Downcast<tir::SeqStmt>(r_for_loop->body)->seq;
if (seq.size() != 3) {
return {state};
}
for (auto& stmt : seq) {
if (!stmt.as<tir::ForNode>()) {
return {state};
}
}

std::vector<State> ret;
ret.push_back(state);
for (int stage : this->stages) {
State new_state = state->Copy();
LoopRV r_loop_fused = new_state->sch->Fuse(new_state->tiles[r_indices_[0]]);
new_state->sch->Annotate(r_loop_fused, tir::attr::software_pipeline_stage,
Array<Integer>{0, 0, stage - 2});
new_state->sch->Annotate(r_loop_fused, tir::attr::software_pipeline_order,
Array<Integer>{0, 1, 2});
new_state->sch->Annotate(r_loop_fused, tir::attr::software_pipeline_async_stages,
Array<Integer>{0});
ret.push_back(std::move(new_state));
}
return ret;
}

void MultiLevelTilingNode::AnnotateCooperativeFetching(Schedule* sch,
const tir::BlockRV& block) const {
// Filter out invalid vector lanes according to the data type.
4 changes: 4 additions & 0 deletions src/meta_schedule/schedule_rule/multi_level_tiling.h
@@ -148,6 +148,8 @@ class MultiLevelTilingNode : public ScheduleRuleNode {
std::vector<State> TileLoopNest(State state) const;
// SubRule 3. add read cache
std::vector<State> AddReadReuse(State state) const;
// SubRule 4. add async pipeline
std::vector<State> AddAsyncPipeline(State state) const;

// Do nothing; Inherited from ScheduleRuleNode
void InitializeWithTuneContext(const TuneContext& context) final;
@@ -192,6 +194,8 @@ class MultiLevelTilingNode : public ScheduleRuleNode {
int thread_warp_size_;
/*! \brief The maximum number of threads to be used */
int max_threads_per_block_;
/*! \brief All available async pipeline stages. */
std::vector<int> stages;
/*! \brief The logging function */
PackedFunc logger;
/*! \brief The function to overwrite the default condition for applying MultiLevelTiling. */
6 changes: 3 additions & 3 deletions tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py
@@ -365,7 +365,7 @@ def cuda_matmul_0(
actual = generate_design_space(
kind="cuda",
mod=mod,
target=Target("nvidia/geforce-rtx-3080"),
target=Target("nvidia/geforce-rtx-2080"), # disable async trace using sm75
types=ms.schedule_rule.MultiLevelTiling,
)
check_sketches(
@@ -483,7 +483,7 @@ def cuda_matmul_relu_0(
actual = generate_design_space(
kind="cuda",
mod=mod,
target=Target("nvidia/geforce-rtx-3080"),
target=Target("nvidia/geforce-rtx-2080"), # disable async trace using sm75
types=ms.schedule_rule.MultiLevelTiling,
)
check_sketches(
@@ -723,7 +723,7 @@ def cache_read_specify_consumer_0(
space = generate_design_space(
kind="cuda",
mod=mod,
target=Target("nvidia/geforce-rtx-3080"),
target=Target("nvidia/geforce-rtx-2080"), # disable async trace using sm75
types=ms.schedule_rule.MultiLevelTiling,
)
check_sketches(
2 changes: 1 addition & 1 deletion tests/python/unittest/test_meta_schedule_space_cuda.py
@@ -27,7 +27,7 @@


def _target():
return Target("nvidia/geforce-rtx-3070")
return Target("nvidia/geforce-rtx-2080") # disable async trace using sm75


def _design_space(mod):
