[Relay][Strategy] Use x86 dense schedules for arm_cpu
Currently the fallback used when compiling a dense operation with
targets such as `llvm -device=arm_cpu` is `dense.generic`, which results
in very poor performance. Although #13775 meant that x86 schedules are
used in cases where no strategy is provided by arm_cpu, the dense
strategy is still registered due to the existence of specialized
schedules for arm_cpu, e.g. a schedule for embedded devices. This
commit ensures the x86 schedules are used in place of the generic
schedule, which yields much better performance.
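
For context, a minimal sketch (not part of this commit) of the scenario the
change targets: compiling a single dense layer for an arm_cpu target. The
shapes and variable names are illustrative only.

```python
import tvm
from tvm import relay

# Illustrative shapes only; any dense workload compiled for arm_cpu is affected.
data = relay.var("data", shape=(1, 512), dtype="float32")
weight = relay.var("weight", shape=(1024, 512), dtype="float32")
mod = tvm.IRModule.from_expr(relay.nn.dense(data, weight))

# Previously this fell back to dense.generic; with this change the x86
# dense_pack/dense_nopack schedules are selected instead.
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=tvm.target.Target("llvm -device=arm_cpu"))
```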

The commit also follows the same approach for the `dense.generic`
schedule as the x86 strategy: it is now only registered when the
auto-scheduler (or meta-schedule) is enabled.
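
As a hedged illustration (not from the commit): `is_auto_scheduler_enabled()`
reads the `relay.backend.use_auto_scheduler` PassContext option, so
`dense.generic` only becomes a candidate inside a context like the one below;
otherwise only the x86 implementations are registered.

```python
import tvm

# Sketch only: with this option set, dense.generic (plevel=11) outranks
# dense_pack.x86 (plevel=10) and dense_nopack.x86 (plevel=5); without it,
# dense.generic is not registered at all and dense_pack.x86 wins.
with tvm.target.Target("llvm -device=arm_cpu"):
    with tvm.transform.PassContext(
        opt_level=3, config={"relay.backend.use_auto_scheduler": True}
    ):
        pass  # relay.build(...) placed here would consider dense.generic
```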

A test has been added to check that the intended schedules are selected
when compiling with `arm_cpu`.

Change-Id: I8697f630d4acfab71a9626cf9e0dc3086987f163
lhutton1 committed Aug 4, 2023
1 parent 63a95d5 commit c02ff2e
Showing 2 changed files with 79 additions and 21 deletions.
62 changes: 41 additions & 21 deletions python/tvm/relay/op/strategy/arm_cpu.py
@@ -559,33 +559,53 @@ def schedule_dense_arm_cpu(attrs, inputs, out_type, target):
            wrap_topi_schedule(topi.arm_cpu.schedule_dense_dsp),
            name="dense_dsp.arm_cpu",
        )
-    else:
-        # For dynamic matrix-vector multiply we use a hand written kernel.
-        if (
-            isinstance(inputs[0].shape[0], (int, tir.IntImm))
-            and inputs[0].shape[0] == 1
-            and (
-                topi.utils.is_dynamic_shape(inputs[0].shape)
-                or topi.utils.is_dynamic_shape(inputs[1].shape)
-            )
-        ):
-            strategy.add_implementation(
-                wrap_compute_dense(topi.x86.dense_dynamic),
-                wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
-                name="dense_dynamic.x86",
-                plevel=20,
-            )
-            return strategy
-        logger.warning("dense is not optimized for arm cpu.")
+        return strategy

+    # For dynamic matrix-vector multiply we use a hand written kernel.
+    if (
+        isinstance(inputs[0].shape[0], (int, tir.IntImm))
+        and inputs[0].shape[0] == 1
+        and (
+            topi.utils.is_dynamic_shape(inputs[0].shape)
+            or topi.utils.is_dynamic_shape(inputs[1].shape)
+        )
+    ):
+        strategy.add_implementation(
+            wrap_compute_dense(topi.x86.dense_dynamic),
+            wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
+            name="dense_dynamic.x86",
+            plevel=20,
+        )
+        return strategy

+    need_auto_scheduler_layout = is_auto_scheduler_enabled()
+    need_meta_schedule_layout = is_meta_schedule_enabled()
+    if need_auto_scheduler_layout or need_meta_schedule_layout:
        strategy.add_implementation(
            wrap_compute_dense(
                topi.nn.dense,
-                need_auto_scheduler_layout=is_auto_scheduler_enabled(),
-                need_meta_schedule_layout=is_meta_schedule_enabled(),
+                need_auto_scheduler_layout=need_auto_scheduler_layout,
+                need_meta_schedule_layout=need_meta_schedule_layout,
            ),
-            wrap_topi_schedule(topi.generic.schedule_dense),
+            naive_schedule,
            name="dense.generic",
            plevel=11,
        )

+    # Fallback to x86 schedules as there is currently no arm_cpu schedule for dense
+    strategy.add_implementation(
+        wrap_compute_dense(topi.x86.dense_nopack),
+        wrap_topi_schedule(topi.x86.schedule_dense_nopack),
+        name="dense_nopack.x86",
+        plevel=5,
+    )
+    strategy.add_implementation(
+        wrap_compute_dense(topi.x86.dense_pack),
+        wrap_topi_schedule(topi.x86.schedule_dense_pack),
+        name="dense_pack.x86",
+        plevel=10,
+    )

    return strategy


38 changes: 38 additions & 0 deletions tests/python/relay/strategy/test_select_implementation.py
@@ -16,7 +16,10 @@
# under the License.

""" Tests strategy selection for Relay ops """

import pytest
import numpy as np

import tvm
from tvm import relay
from tvm import te
@@ -149,5 +152,40 @@ def test_int8_depthwise_conv2d(target, expected_impl):
    assert impl.name == expected_impl


+@pytest.mark.parametrize(
+    "target,expected_valid_impl,expected_impl",
+    [("llvm -device=arm_cpu", ["dense_pack.x86", "dense_nopack.x86"], "dense_pack.x86")],
+)
+def test_dense(target, expected_valid_impl, expected_impl):
+    target = tvm.target.Target(target)

+    data_shape = (30, 40)
+    weight_shape = (30, 40)
+    dtype = "float32"

+    out = relay.nn.dense(
+        relay.var("data", shape=data_shape, dtype=dtype),
+        relay.var("weight", shape=weight_shape, dtype=dtype),
+        out_dtype=dtype,
+    )
+    out = run_infer_type(out)

+    with target:
+        args = [
+            out.op,
+            out.attrs,
+            [te.placeholder(data_shape, dtype), te.placeholder(weight_shape, dtype)],
+            out.checked_type,
+            target,
+        ]
+        valid_impl = relay.backend.te_compiler.get_valid_implementations(*args)
+        selected_impl, _ = relay.backend.te_compiler.select_implementation(*args, use_autotvm=False)

+    assert len(valid_impl) == len(expected_valid_impl)
+    for impl in valid_impl:
+        assert impl.name in expected_valid_impl
+    assert selected_impl.name == expected_impl


if __name__ == "__main__":
    tvm.testing.main()
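
One way to exercise just the new test (a hypothetical invocation, assuming a
TVM checkout with pytest installed) is pytest's Python entry point:

```python
# Equivalent to running pytest on the test file from the repository root.
import pytest

pytest.main(["tests/python/relay/strategy/test_select_implementation.py", "-k", "test_dense"])
```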
