diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index bbbe5bb73271..24966019db6d 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -559,33 +559,53 @@ def schedule_dense_arm_cpu(attrs, inputs, out_type, target):
             wrap_topi_schedule(topi.arm_cpu.schedule_dense_dsp),
             name="dense_dsp.arm_cpu",
         )
-    else:
-        # For dynamic matrix-vector multiply we use a hand written kernel.
-        if (
-            isinstance(inputs[0].shape[0], (int, tir.IntImm))
-            and inputs[0].shape[0] == 1
-            and (
-                topi.utils.is_dynamic_shape(inputs[0].shape)
-                or topi.utils.is_dynamic_shape(inputs[1].shape)
-            )
-        ):
-            strategy.add_implementation(
-                wrap_compute_dense(topi.x86.dense_dynamic),
-                wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
-                name="dense_dynamic.x86",
-                plevel=20,
-            )
-            return strategy
-        logger.warning("dense is not optimized for arm cpu.")
+        return strategy
+
+    # For dynamic matrix-vector multiply we use a hand written kernel.
+    if (
+        isinstance(inputs[0].shape[0], (int, tir.IntImm))
+        and inputs[0].shape[0] == 1
+        and (
+            topi.utils.is_dynamic_shape(inputs[0].shape)
+            or topi.utils.is_dynamic_shape(inputs[1].shape)
+        )
+    ):
+        strategy.add_implementation(
+            wrap_compute_dense(topi.x86.dense_dynamic),
+            wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
+            name="dense_dynamic.x86",
+            plevel=20,
+        )
+        return strategy
+
+    need_auto_scheduler_layout = is_auto_scheduler_enabled()
+    need_meta_schedule_layout = is_meta_schedule_enabled()
+    if need_auto_scheduler_layout or need_meta_schedule_layout:
         strategy.add_implementation(
             wrap_compute_dense(
                 topi.nn.dense,
-                need_auto_scheduler_layout=is_auto_scheduler_enabled(),
-                need_meta_schedule_layout=is_meta_schedule_enabled(),
+                need_auto_scheduler_layout=need_auto_scheduler_layout,
+                need_meta_schedule_layout=need_meta_schedule_layout,
             ),
-            wrap_topi_schedule(topi.generic.schedule_dense),
+            naive_schedule,
             name="dense.generic",
+            plevel=11,
         )
+
+    # Fallback to x86 schedules as there is currently no arm_cpu schedule for dense
+    strategy.add_implementation(
+        wrap_compute_dense(topi.x86.dense_nopack),
+        wrap_topi_schedule(topi.x86.schedule_dense_nopack),
+        name="dense_nopack.x86",
+        plevel=5,
+    )
+    strategy.add_implementation(
+        wrap_compute_dense(topi.x86.dense_pack),
+        wrap_topi_schedule(topi.x86.schedule_dense_pack),
+        name="dense_pack.x86",
+        plevel=10,
+    )
+    return strategy
diff --git a/tests/python/relay/strategy/test_select_implementation.py b/tests/python/relay/strategy/test_select_implementation.py
index eae186524c25..20dfe9670ab3 100644
--- a/tests/python/relay/strategy/test_select_implementation.py
+++ b/tests/python/relay/strategy/test_select_implementation.py
@@ -16,7 +16,10 @@
 # under the License.
 """ Tests strategy selection for Relay ops """
+
 import pytest
+import numpy as np
+
 import tvm
 from tvm import relay
 from tvm import te
@@ -149,5 +152,40 @@ def test_int8_depthwise_conv2d(target, expected_impl):
     assert impl.name == expected_impl


+@pytest.mark.parametrize(
+    "target,expected_valid_impl,expected_impl",
+    [("llvm -device=arm_cpu", ["dense_pack.x86", "dense_nopack.x86"], "dense_pack.x86")],
+)
+def test_dense(target, expected_valid_impl, expected_impl):
+    target = tvm.target.Target(target)
+
+    data_shape = (30, 40)
+    weight_shape = (30, 40)
+    dtype = "float32"
+
+    out = relay.nn.dense(
+        relay.var("data", shape=data_shape, dtype=dtype),
+        relay.var("weight", shape=weight_shape, dtype=dtype),
+        out_dtype=dtype,
+    )
+    out = run_infer_type(out)
+
+    with target:
+        args = [
+            out.op,
+            out.attrs,
+            [te.placeholder(data_shape, dtype), te.placeholder(weight_shape, dtype)],
+            out.checked_type,
+            target,
+        ]
+        valid_impl = relay.backend.te_compiler.get_valid_implementations(*args)
+        selected_impl, _ = relay.backend.te_compiler.select_implementation(*args, use_autotvm=False)
+
+    assert len(valid_impl) == len(expected_valid_impl)
+    for impl in valid_impl:
+        assert impl.name in expected_valid_impl
+    assert selected_impl.name == expected_impl
+
+
 if __name__ == "__main__":
     tvm.testing.main()