From 6a7afebe3fbfacbfccbf4d6ba5f656870ccbbad1 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Sun, 28 Aug 2022 21:52:31 -0700 Subject: [PATCH 1/3] [Hexagon] Tests pylint --- tests/lint/pylint.sh | 17 +- .../contrib/test_hexagon/pytest_util.py | 36 +- .../test_hexagon/test_async_dma_pipeline.py | 510 ++++++++++-------- .../test_hexagon/test_benchmark_maxpool2d.py | 190 +++---- .../test_fixed_point_conversion.py | 13 +- .../test_hexagon/test_fixed_point_multiply.py | 167 +++--- .../contrib/test_hexagon/test_memory_alloc.py | 22 +- .../test_hexagon/test_meta_schedule.py | 138 +++-- .../contrib/test_hexagon/test_parallel_hvx.py | 81 +-- .../test_parallel_hvx_load_vtcm.py | 224 ++++---- .../test_hexagon/test_parallel_scalar.py | 63 ++- .../contrib/test_hexagon/test_sigmoid.py | 40 +- .../test_software_pipeline_async.py | 269 ++++----- .../test_hexagon/test_vtcm_bandwidth.py | 53 +- .../test_wo_qnn_canonicalization.py | 5 +- 15 files changed, 1002 insertions(+), 826 deletions(-) diff --git a/tests/lint/pylint.sh b/tests/lint/pylint.sh index 6b5415987985..e41dc2bb80b8 100755 --- a/tests/lint/pylint.sh +++ b/tests/lint/pylint.sh @@ -27,20 +27,9 @@ python3 -m pylint tests/python/ci --rcfile="$(dirname "$0")"/pylintrc python3 -m pylint tests/python/integration/ --rcfile="$(dirname "$0")"/pylintrc # tests/python/contrib/test_hexagon tests -python3 -m pylint tests/python/contrib/test_hexagon/benchmark_util.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/conftest.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/infrastructure.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_2d_physical_buffers.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_autotvm.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_cache_read_write.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_launcher.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_models.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_run_unit_tests.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_thread_pool.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_usmp.py --rcfile="$(dirname "$0")"/pylintrc +python3 -m pylint tests/python/contrib/test_hexagon/*.py --rcfile="$(dirname "$0")"/pylintrc +python3 -m pylint tests/python/contrib/test_hexagon/conv2d/*.py --rcfile="$(dirname "$0")"/pylintrc + # tests/python/frontend tests python3 -m pylint tests/python/frontend/caffe/test_forward.py --rcfile="$(dirname "$0")"/pylintrc diff --git a/tests/python/contrib/test_hexagon/pytest_util.py b/tests/python/contrib/test_hexagon/pytest_util.py index 77842ce91493..c078edf7a934 100644 --- a/tests/python/contrib/test_hexagon/pytest_util.py +++ b/tests/python/contrib/test_hexagon/pytest_util.py @@ -15,11 +15,11 @@ # specific language governing permissions and 
limitations # under the License. -import pytest -import numpy as np -from typing import * +""" Hexagon pytest utility functions """ + +from typing import List, Optional, Union import collections -import tvm.testing +import numpy as np def get_test_id(*test_params, test_param_descs: List[Optional[str]] = None) -> str: @@ -47,35 +47,35 @@ def get_test_id(*test_params, test_param_descs: List[Optional[str]] = None) -> s assert len(test_param_descs) == len(test_params) def get_single_param_chunk(param_val, param_desc: Optional[str]): - if type(param_val) == list: + if isinstance(param_val, list): # Like str(list), but avoid the whitespace padding. val_str = "[" + ",".join(str(x) for x in param_val) + "]" need_prefix_separator = False - elif type(param_val) == bool: + elif isinstance(param_val, bool): if param_val: val_str = "T" else: val_str = "F" need_prefix_separator = True - elif type(param_val) == TensorContentConstant: + elif isinstance(param_val, TensorContentConstant): val_str = f"const[{param_val.elem_value}]" need_prefix_separator = True - elif type(param_val) == TensorContentDtypeMin: + elif isinstance(param_val, TensorContentDtypeMin): val_str = "min" need_prefix_separator = True - elif type(param_val) == TensorContentDtypeMax: + elif isinstance(param_val, TensorContentDtypeMax): val_str = "max" need_prefix_separator = True - elif type(param_val) == TensorContentRandom: + elif isinstance(param_val, TensorContentRandom): val_str = "random" need_prefix_separator = True - elif type(param_val) == TensorContentSequentialCOrder: + elif isinstance(param_val, TensorContentSequentialCOrder): val_str = f"seqC[start:{param_val.start_value},inc:{param_val.increment}]" need_prefix_separator = True @@ -148,26 +148,26 @@ def create_populated_numpy_ndarray( """ itp = input_tensor_populator # just for brevity - if type(itp) == TensorContentConstant: + if isinstance(itp, TensorContentConstant): return np.full(tuple(input_shape), itp.elem_value, dtype=dtype) - elif type(itp) == TensorContentDtypeMin: + elif isinstance(itp, TensorContentDtypeMin): info = get_numpy_dtype_info(dtype) return np.full(tuple(input_shape), info.min, dtype=dtype) - elif type(itp) == TensorContentDtypeMax: + elif isinstance(itp, TensorContentDtypeMax): info = get_numpy_dtype_info(dtype) return np.full(tuple(input_shape), info.max, dtype=dtype) - elif type(itp) == TensorContentRandom: + elif isinstance(itp, TensorContentRandom): return np.random.random(input_shape).astype(dtype) - elif type(itp) == TensorContentSequentialCOrder: + elif isinstance(itp, TensorContentSequentialCOrder): a = np.empty(tuple(input_shape), dtype) - with np.nditer(a, op_flags=["writeonly"], order="C") as it: + with np.nditer(a, op_flags=["writeonly"], order="C") as iterator: next_elem_val = itp.start_value - for elem in it: + for elem in iterator: elem[...] 
= next_elem_val next_elem_val += itp.increment return a diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index 45e8eb0f68c6..c5830387160b 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -29,30 +29,35 @@ def conv_approximation(size_a, size_w): + """Conv approximation.""" a_shape = (size_a, VRMPY_SIZE_B) w_shape = (size_w, VRMPY_SIZE_B) out_shape = (size_a, VRMPY_SIZE_INT32) @T.prim_func - def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, a_shape, dtype="uint8") - W = T.match_buffer(b, w_shape, dtype="uint8") - C = T.match_buffer(c, out_shape, dtype="int32") - for n, i in T.grid(size_a, size_w): - with T.block("C"): - vn, vi = T.axis.remap("SR", [n, i]) - T.reads(A[vn, 0:VRMPY_SIZE_B], W[vi, 0:VRMPY_SIZE_B], C[vn, 0:VRMPY_SIZE_INT32]) - T.writes(C[vn, 0:VRMPY_SIZE_INT32]) + a_buffer = T.match_buffer(a_input, a_shape, dtype="uint8") + w_buffer = T.match_buffer(b_input, w_shape, dtype="uint8") + c_buffer = T.match_buffer(c_output, out_shape, dtype="int32") + for n, index_0 in T.grid(size_a, size_w): + with T.block("c_buffer"): + vn_index, vi_index = T.axis.remap("SR", [n, index_0]) + T.reads( + a_buffer[vn_index, 0:VRMPY_SIZE_B], + w_buffer[vi_index, 0:VRMPY_SIZE_B], + c_buffer[vn_index, 0:VRMPY_SIZE_INT32], + ) + T.writes(c_buffer[vn_index, 0:VRMPY_SIZE_INT32]) with T.init(): for x in T.serial(VRMPY_SIZE_INT32): - C[vn, x] = 0 - C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin( + c_buffer[vn_index, x] = 0 + c_buffer[vn_index, T.ramp(0, 1, 32)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.acc.128B"), T.uint32(3), - C[vn, T.ramp(0, 1, 32)], - T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), - T.reinterpret(W[vi, T.ramp(0, 1, 128)], dtype="int32x32"), + c_buffer[vn_index, T.ramp(0, 1, 32)], + T.reinterpret(a_buffer[vn_index, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(w_buffer[vi_index, T.ramp(0, 1, 128)], dtype="int32x32"), dtype="int32x32", ) # Currently async DMA lowering does not add any wait to the end of schedules so @@ -73,13 +78,14 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: def evaluate( hexagon_session, sch, - a, - b, - c, + a_data, + b_data, + c_data, expected_output=None, use_async_copy=0, merge_async_commit_queue_scope=False, ): + """Evaluate function.""" target_hexagon = tvm.target.hexagon("v68", link_params=True) with tvm.transform.PassContext( config={ @@ -92,9 +98,9 @@ def evaluate( ) module = hexagon_session.load_module(func_tir) - a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) - b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device) - c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) + a_hexagon = tvm.runtime.ndarray.array(a_data, device=hexagon_session.device) + b_hexagon = tvm.runtime.ndarray.array(b_data, device=hexagon_session.device) + c_hexagon = tvm.runtime.ndarray.array(c_data, device=hexagon_session.device) if tvm.testing.utils.IS_IN_CI: # Run with reduced number and repeat for CI @@ -108,32 +114,8 @@ def evaluate( return round(time.mean * 1000, 4) -@tvm.testing.fixture -def input_a(size_a): - return default_rng().integers(0, 8, (size_a, VRMPY_SIZE_B), dtype="uint8") - - 
-@tvm.testing.fixture -def input_w(size_w): - return default_rng().integers(0, 8, (size_w, VRMPY_SIZE_B), dtype="uint8") - - -@tvm.testing.fixture -def expected_output(size_a, size_w, input_a, input_w): - if tvm.testing.utils.IS_IN_CI and (size_a > 1024 or size_w > 1): - pytest.skip("Skipping test since it takes too long in CI.") - expected_output = np.zeros((size_a, VRMPY_SIZE_INT32), dtype="int32") - for n in range(size_a): - for x in range(size_w): - for i in range(VRMPY_SIZE_INT32): - for r in range(4): - expected_output[n, i] += np.uint32(input_a[n, i * 4 + r]) * np.uint32( - input_w[x, i * 4 + r] - ) - return expected_output - - def get_single_dma_schedule(size_a, size_w): + """Generate single DMA schedule.""" a_shape = (size_a, VRMPY_SIZE_B) w_shape = (size_w, VRMPY_SIZE_B) out_shape = (size_a, VRMPY_SIZE_INT32) @@ -142,32 +124,32 @@ def get_single_dma_schedule(size_a, size_w): w_bytes = size_w * VRMPY_SIZE_B @T.prim_func - def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, a_shape, dtype="uint8", mem_scope="global") - W = T.match_buffer(b, w_shape, dtype="uint8", mem_scope="global") - C = T.match_buffer(c, out_shape, dtype="int32", mem_scope="global") - A_global_vtcm = T.alloc_buffer(a_shape, dtype="uint8", mem_scope="global.vtcm") - W_global_vtcm = T.alloc_buffer(w_shape, dtype="uint8", mem_scope="global.vtcm") - C_global_vtcm = T.alloc_buffer(out_shape, dtype="int32", mem_scope="global.vtcm") + a_buffer = T.match_buffer(a_input, a_shape, dtype="uint8", mem_scope="global") + w_buffer = T.match_buffer(b_input, w_shape, dtype="uint8", mem_scope="global") + c_buffer = T.match_buffer(c_output, out_shape, dtype="int32", mem_scope="global") + a_global_vtcm = T.alloc_buffer(a_shape, dtype="uint8", mem_scope="global.vtcm") + w_global_vtcm = T.alloc_buffer(w_shape, dtype="uint8", mem_scope="global.vtcm") + c_global_vtcm = T.alloc_buffer(out_shape, dtype="int32", mem_scope="global.vtcm") T.evaluate( T.tvm_call_packed( "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( - A_global_vtcm.data, + a_global_vtcm.data, T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), 0, 2, - A_global_vtcm.dtype, + a_global_vtcm.dtype, 0, dtype="handle", ), T.tvm_stack_make_array( - A.data, + a_buffer.data, T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), 0, 2, - A.dtype, + a_buffer.dtype, 0, dtype="handle", ), @@ -179,20 +161,20 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: T.tvm_call_packed( "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( - W_global_vtcm.data, + w_global_vtcm.data, T.tvm_stack_make_shape(size_w, VRMPY_SIZE_B, dtype="handle"), 0, 2, - W_global_vtcm.dtype, + w_global_vtcm.dtype, 0, dtype="handle", ), T.tvm_stack_make_array( - W.data, + w_buffer.data, T.tvm_stack_make_shape(size_w, VRMPY_SIZE_B, dtype="handle"), 0, 2, - W.dtype, + w_buffer.dtype, 0, dtype="handle", ), @@ -200,43 +182,43 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: dtype="int32", ) ) - for n, i in T.grid(size_a, size_w): - with T.block("C"): - vn, vi = T.axis.remap("SR", [n, i]) + for n, index_0 in T.grid(size_a, size_w): + with T.block("c_buffer"): + vn_index, vi_index = T.axis.remap("SR", [n, index_0]) T.reads( - A_global_vtcm[vn, 0:VRMPY_SIZE_B], - W_global_vtcm[vi, 0:VRMPY_SIZE_B], - C_global_vtcm[vn, 0:VRMPY_SIZE_INT32], + a_global_vtcm[vn_index, 0:VRMPY_SIZE_B], + 
w_global_vtcm[vi_index, 0:VRMPY_SIZE_B], + c_global_vtcm[vn_index, 0:VRMPY_SIZE_INT32], ) - T.writes(C_global_vtcm[vn, 0:VRMPY_SIZE_INT32]) + T.writes(c_global_vtcm[vn_index, 0:VRMPY_SIZE_INT32]) with T.init(): for x in T.serial(VRMPY_SIZE_INT32): - C_global_vtcm[vn, x] = 0 - C_global_vtcm[vn, T.ramp(0, 1, 32)] += T.call_llvm_intrin( + c_global_vtcm[vn_index, x] = 0 + c_global_vtcm[vn_index, T.ramp(0, 1, 32)] += T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), T.uint32(2), - T.reinterpret(A_global_vtcm[vn, T.ramp(0, 1, 128)], dtype="int32x32"), - T.reinterpret(W_global_vtcm[vi, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(a_global_vtcm[vn_index, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(w_global_vtcm[vi_index, T.ramp(0, 1, 128)], dtype="int32x32"), dtype="int32x32", ) T.evaluate( T.tvm_call_packed( "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( - C.data, + c_buffer.data, T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), 0, 2, - C.dtype, + c_buffer.dtype, 0, dtype="handle", ), T.tvm_stack_make_array( - C_global_vtcm.data, + c_global_vtcm.data, T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), 0, 2, - C_global_vtcm.dtype, + c_global_vtcm.dtype, 0, dtype="handle", ), @@ -251,43 +233,45 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: def get_fake_conv_vtcm_schedule(size_a, size_w, blocks=2): + """Generate fake conv schedule with VTCM.""" sch = conv_approximation(size_a, size_w) - compute_block = sch.get_block("C") + compute_block = sch.get_block("c_buffer") sch.cache_read(compute_block, 1, "global.vtcm") n = sch.get_loops(compute_block)[0] - no, _ = sch.split(n, [blocks, None]) + n_outer, _ = sch.split(n, [blocks, None]) cache_read_block_a = sch.cache_read(compute_block, 0, "global.vtcm") - sch.compute_at(cache_read_block_a, no) + sch.compute_at(cache_read_block_a, n_outer) sch.fuse(*sch.get_loops(cache_read_block_a)[1:]) cache_write_block_c = sch.cache_write(compute_block, 0, "global.vtcm") - sch.reverse_compute_at(cache_write_block_c, no) + sch.reverse_compute_at(cache_write_block_c, n_outer) sch.fuse(*sch.get_loops(cache_write_block_c)[1:]) return sch def get_multi_input_fake_conv_vtcm_schedule(size_a, size_w, blocks=2): + """Generate multi input fake Conv using VTCM.""" sch = conv_approximation(size_a, size_w) - compute_block = sch.get_block("C") + compute_block = sch.get_block("c_buffer") n = sch.get_loops(compute_block)[0] - no, _ = sch.split(n, [blocks, None]) + n_outer, _ = sch.split(n, [blocks, None]) cache_read_block_a = sch.cache_read(compute_block, 0, "global.vtcm") - sch.compute_at(cache_read_block_a, no) + sch.compute_at(cache_read_block_a, n_outer) sch.fuse(*sch.get_loops(cache_read_block_a)[1:]) cache_read_block_b = sch.cache_read(compute_block, 1, "global.vtcm") - sch.compute_at(cache_read_block_b, no) + sch.compute_at(cache_read_block_b, n_outer) sch.fuse(*sch.get_loops(cache_read_block_b)[1:]) cache_write_block_c = sch.cache_write(compute_block, 0, "global.vtcm") - sch.reverse_compute_at(cache_write_block_c, no) + sch.reverse_compute_at(cache_write_block_c, n_outer) sch.fuse(*sch.get_loops(cache_write_block_c)[1:]) return sch @@ -301,6 +285,8 @@ def print_results(test_key, runtimes): class TestAsyncDMAPipeline: + """Async DMA pipeline test class.""" + # Removed most of these to speedup CI. 
size_a = tvm.testing.parameter( 1024, @@ -314,6 +300,29 @@ class TestAsyncDMAPipeline: 9 * 9, ) + @tvm.testing.fixture + def input_a(self, size_a): + return default_rng().integers(0, 8, (size_a, VRMPY_SIZE_B), dtype="uint8") + + @tvm.testing.fixture + def input_w(self, size_w): + return default_rng().integers(0, 8, (size_w, VRMPY_SIZE_B), dtype="uint8") + + @tvm.testing.fixture + def expected_output(self, size_a, size_w, input_a, input_w): + """Generate expected output.""" + if tvm.testing.utils.IS_IN_CI and (size_a > 1024 or size_w > 1): + pytest.skip("Skipping test since it takes too long in CI.") + expected_result = np.zeros((size_a, VRMPY_SIZE_INT32), dtype="int32") + for n in range(size_a): + for x in range(size_w): + for index_0 in range(VRMPY_SIZE_INT32): + for r_index in range(4): + expected_result[n, index_0] += np.uint32( + input_a[n, index_0 * 4 + r_index] + ) * np.uint32(input_w[x, index_0 * 4 + r_index]) + return expected_result + @tvm.testing.requires_hexagon def test_loading_vtcm_for_vrmpy( self, @@ -324,6 +333,7 @@ def test_loading_vtcm_for_vrmpy( input_w, expected_output, ): + """VTCM for VRMPY test.""" if tvm.testing.utils.IS_IN_CI and (size_a > 1024 or size_w > 1): pytest.skip("Skipping test since it takes too long in CI.") @@ -350,7 +360,7 @@ def test_loading_vtcm_for_vrmpy( ) sch = get_fake_conv_vtcm_schedule(size_a, size_w) - n = sch.get_loops(sch.get_block("C"))[0] + n = sch.get_loops(sch.get_block("c_buffer"))[0] sch.annotate(n, "software_pipeline_stage", [0, 1, 2]) sch.annotate(n, "software_pipeline_order", [0, 1, 2]) sch.annotate(n, "software_pipeline_async_stages", [0]) @@ -365,11 +375,11 @@ def test_loading_vtcm_for_vrmpy( ) sch = get_fake_conv_vtcm_schedule(size_a, size_w) - n = sch.get_loops(sch.get_block("C"))[0] + n = sch.get_loops(sch.get_block("c_buffer"))[0] sch.annotate(n, "software_pipeline_stage", [0, 1, 2]) sch.annotate(n, "software_pipeline_order", [0, 1, 2]) sch.annotate(n, "software_pipeline_async_stages", [0, 2]) - async_input_output_runtime = evaluate( + async_input_output = evaluate( hexagon_session, sch, input_a, @@ -380,11 +390,11 @@ def test_loading_vtcm_for_vrmpy( ) sch = get_fake_conv_vtcm_schedule(size_a, size_w) - n = sch.get_loops(sch.get_block("C"))[0] + n = sch.get_loops(sch.get_block("c_buffer"))[0] sch.annotate(n, "software_pipeline_stage", [0, 3, 6]) sch.annotate(n, "software_pipeline_order", [0, 1, 2]) sch.annotate(n, "software_pipeline_async_stages", [0, 6]) - async_input_output_runtime_larger_buffers = evaluate( + async_larger_buffers = evaluate( hexagon_session, sch, input_a, @@ -395,11 +405,11 @@ def test_loading_vtcm_for_vrmpy( ) sch = get_multi_input_fake_conv_vtcm_schedule(size_a, size_w) - n = sch.get_loops(sch.get_block("C"))[0] + n = sch.get_loops(sch.get_block("c_buffer"))[0] sch.annotate(n, "software_pipeline_stage", [0, 0, 1, 2]) sch.annotate(n, "software_pipeline_order", [0, 1, 2, 3]) sch.annotate(n, "software_pipeline_async_stages", [0, 2]) - async_multi_input_output_runtime = evaluate( + async_multi_input_output = evaluate( hexagon_session, sch, input_a, @@ -411,7 +421,7 @@ def test_loading_vtcm_for_vrmpy( ) sch = get_fake_conv_vtcm_schedule(size_a, size_w) - n = sch.get_loops(sch.get_block("C"))[0] + n = sch.get_loops(sch.get_block("c_buffer"))[0] sch.annotate(n, "software_pipeline_stage", [0, 1, 2]) sch.annotate(n, "software_pipeline_order", [0, 1, 2]) sch.annotate(n, "software_pipeline_async_stages", [2]) @@ -435,22 +445,30 @@ def test_loading_vtcm_for_vrmpy( expected_output, ) - # Total transfer size is equal to 
the size of A + W + C which is equal to 2 * size_a * 128 + size_w * 128 + # Total transfer size is equal to the size of + # a_buffer + w_buffer + c_buffer which is equal to 2 * size_a * 128 + size_w * 128 transfer_mb = round((2 * size_a * VRMPY_SIZE_B + size_w * VRMPY_SIZE_B) / 1e6, 2) - # Total number of operations can be calculated given the total number of vrmpy calls (size_a * size_w) * operations per vrmpy accumulate (128 multiplies + 3 adds for reduction per lane + 1 add for accumulate per lane) + # Total number of operations can be calculated given + # the total number of vrmpy calls (size_a * size_w) * operations + # per vrmpy accumulate (128 multiplies + 3 adds for reduction + # per lane + 1 add for accumulate per lane) complexity = round(size_a * size_w * (VRMPY_SIZE_B * 4) / 1e9, 3) print_results( - f"Test with A.size: {size_a * VRMPY_SIZE_B}, W.size: {size_w * VRMPY_SIZE_B}, computational complexity of {complexity} GOPs, and total memory transfer of {transfer_mb} MB...", + ( + f"Test with a_buffer.size: {size_a * VRMPY_SIZE_B}, w_buffer.size:" + f" {size_w * VRMPY_SIZE_B}, computational complexity of {complexity} GOPs" + f", and total memory transfer of {transfer_mb} MB..." + ), { "without_vtcm": base_runtime, "synchronous_dma": single_dma_runtime, "base_vtcm": base_vtcm_runtime, "async_dma_input": async_input_runtime, "async_dma_output": async_output_runtime, - "async_dma_input_output": async_input_output_runtime, - "async_dma_multi_input_output": async_multi_input_output_runtime, - "async_input_output_runtime_larger_buffers": async_input_output_runtime_larger_buffers, + "async_dma_input_output": async_input_output, + "async_dma_multi_input_output": async_multi_input_output, + "async_input_output_runtime_larger_buffers": async_larger_buffers, }, ) @@ -458,84 +476,102 @@ def test_loading_vtcm_for_vrmpy( # from tvm.script import tir as T @tvm.script.ir_module class ModulePipelined: + """Pipelined module class.""" + + # pylint: disable=no-self-argument @T.prim_func def main( - p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], - p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], - T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"], + p0_buffer: T.Buffer[(1, 1, 230, 230, 4), "uint8"], + p1_buffer: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], + t_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"], ) -> None: + # pylint: disable=missing-function-docstring # function attr dict T.func_attr({"tir.noalias": True, "global_symbol": "main"}) # body # with T.block("root") - conv2d_NCHWc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32", scope="global.vtcm") + conv2d_nchwc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32", scope="global.vtcm") p0_global_vtcm = T.alloc_buffer([1, 1, 230, 230, 4], dtype="uint8", scope="global.vtcm") p1_global_vtcm = T.alloc_buffer([2, 1, 7, 7, 1, 32, 4], dtype="int8", scope="global.vtcm") for ax0, ax1, ax2, ax3, ax4, ax5, ax6 in T.grid(2, 1, 7, 7, 1, 32, 4): with T.block("p1_global.vtcm"): - v0, v1, v2, v3, v4, v5, v6 = T.axis.remap( + v0_ind, v1_ind, v2_ind, v3_ind, v4_ind, v5_ind, v6_ind = T.axis.remap( "SSSSSSS", [ax0, ax1, ax2, ax3, ax4, ax5, ax6] ) - T.reads(p1[v0, v1, v2, v3, v4, v5, v6]) - T.writes(p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6]) - p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6] = p1[v0, v1, v2, v3, v4, v5, v6] - for po in T.serial(4): - for i in T.serial(55876): + T.reads(p1_buffer[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind, v5_ind, v6_ind]) + T.writes(p1_global_vtcm[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind, v5_ind, v6_ind]) + p1_global_vtcm[v0_ind, v1_ind, 
v2_ind, v3_ind, v4_ind, v5_ind, v6_ind] = p1_buffer[ + v0_ind, v1_ind, v2_ind, v3_ind, v4_ind, v5_ind, v6_ind + ] + for p_outer in T.serial(4): + for index_0 in T.serial(55876): with T.block("p0_global.vtcm"): - v0 = T.axis.spatial(1, 0) - v1 = T.axis.spatial(1, 0) - v2 = T.axis.spatial(230, po * 56 + i // 916) - v3 = T.axis.spatial(230, i % 916 // 4) - v4 = T.axis.spatial(4, i % 4) - T.reads(p0[v0, v1, v2, v3, v4]) - T.writes(p0_global_vtcm[v0, v1, v2, v3, v4]) - p0_global_vtcm[v0, v1, v2, v3, v4] = p0[v0, v1, v2, v3, v4] - for i in T.parallel(28): - for ii, iii, iiii in T.grid(2, 14, 8): + v0_ind = T.axis.spatial(1, 0) + v1_ind = T.axis.spatial(1, 0) + v2_ind = T.axis.spatial(230, p_outer * 56 + index_0 // 916) + v3_ind = T.axis.spatial(230, index_0 % 916 // 4) + v4_ind = T.axis.spatial(4, index_0 % 4) + T.reads(p0_buffer[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind]) + T.writes(p0_global_vtcm[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind]) + p0_global_vtcm[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind] = p0_buffer[ + v0_ind, v1_ind, v2_ind, v3_ind, v4_ind + ] + for index_0 in T.parallel(28): + for index_1, index_2, index_3 in T.grid(2, 14, 8): with T.block("conv2d_NCHWc_int8_o_init"): n = T.axis.spatial(1, 0) - oc_chunk = T.axis.spatial(2, ii) - oh = T.axis.spatial(112, (po * 28 + i) // 14 * 14 + iii) - ow = T.axis.spatial(112, (po * 28 + i) % 14 * 8 + iiii) - oc_block_o = T.axis.spatial(1, 0) + oc_chunk = T.axis.spatial(2, index_1) + o_height = T.axis.spatial( + 112, (p_outer * 28 + index_0) // 14 * 14 + index_2 + ) + o_width = T.axis.spatial(112, (p_outer * 28 + index_0) % 14 * 8 + index_3) + oc_block_o = T.axis.spatial(1, 0) # pylint: disable=unused-variable T.reads() - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32]) + T.writes(conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32]) for i4_1 in T.vectorized(32): with T.block("conv2d_NCHWc_int8_init"): oc_block_i_init = T.axis.spatial(32, i4_1) T.reads() - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init]) - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0 + T.writes( + conv2d_nchwc_int8[ + n, oc_chunk, o_height, o_width, oc_block_i_init + ] + ) + conv2d_nchwc_int8[ + n, oc_chunk, o_height, o_width, oc_block_i_init + ] = 0 for i1_1, i5_1, i6_1, i2_2, i3_2 in T.grid(2, 7, 7, 14, 8): with T.block("conv2d_NCHWc_int8_o_update"): n = T.axis.spatial(1, 0) oc_chunk = T.axis.spatial(2, i1_1) - oh = T.axis.spatial(112, (po * 28 + i) // 14 * 14 + i2_2) - ow = T.axis.spatial(112, (po * 28 + i) % 14 * 8 + i3_2) - oc_block_o = T.axis.spatial(1, 0) - kh = T.axis.reduce(7, i5_1) - kw = T.axis.reduce(7, i6_1) + o_height = T.axis.spatial(112, (p_outer * 28 + index_0) // 14 * 14 + i2_2) + o_width = T.axis.spatial(112, (p_outer * 28 + index_0) % 14 * 8 + i3_2) + oc_block_o = T.axis.spatial(1, 0) # pylint: disable=unused-variable + k_height = T.axis.reduce(7, i5_1) + k_width = T.axis.reduce(7, i6_1) ic_outer = T.axis.reduce(1, 0) ic_f_inner = T.axis.reduce(1, 0) - ic_s_inner_o = T.axis.reduce(1, 0) + ic_s_inner_o = T.axis.reduce(1, 0) # pylint: disable=unused-variable T.reads( - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32], + conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32], p0_global_vtcm[ n, ic_outer, - oh * 2 + kh, - ow * 2 + kw, + o_height * 2 + k_height, + o_width * 2 + k_width, ic_f_inner * 4 : ic_f_inner * 4 + 4, ], - p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4], + p1_global_vtcm[ + oc_chunk, ic_outer, k_height, k_width, ic_f_inner, 0:32, 0:4 + ], ) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32]) 
- A = T.match_buffer( + T.writes(conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32]) + a_buffer = T.match_buffer( p0_global_vtcm[ n, ic_outer, - oh * 2 + kh, - ow * 2 + kw, + o_height * 2 + k_height, + o_width * 2 + k_width, ic_f_inner * 4 : ic_f_inner * 4 + 4, ], [4], @@ -543,42 +579,48 @@ def main( offset_factor=1, scope="global.vtcm", ) - B = T.match_buffer( - p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4], + b_buffer = T.match_buffer( + p1_global_vtcm[ + oc_chunk, ic_outer, k_height, k_width, ic_f_inner, 0:32, 0:4 + ], [32, 4], dtype="int8", offset_factor=1, scope="global.vtcm", ) - C = T.match_buffer( - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32], + c_buffer = T.match_buffer( + conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32], [32], dtype="int32", offset_factor=1, scope="global.vtcm", ) - A_u8x4: T.uint8x4 = A[0:4] - A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") - B_i8x128 = B[0, 0:128] - B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32") - C[0:32] = T.call_llvm_pure_intrin( + a_u8x4: T.uint8x4 = a_buffer[0:4] + a_i32: T.int32 = T.reinterpret(a_u8x4, dtype="int32") + b_i8x128 = b_buffer[0, 0:128] + b_i32x32: T.int32x32 = T.reinterpret(b_i8x128, dtype="int32x32") + c_buffer[0:32] = T.call_llvm_pure_intrin( 4217, T.uint32(3), - C[0:32], - T.broadcast(A_i32, 32), - B_i32x32, + c_buffer[0:32], + T.broadcast(a_i32, 32), + b_i32x32, dtype="int32x32", ) - for i in T.serial(200704): - with T.block("conv2d_NCHWc_int8.vtcm"): + for index_0 in T.serial(200704): + with T.block("conv2d_nchwc_int8.vtcm"): ax0_1 = T.axis.spatial(1, 0) - ax1_1 = T.axis.spatial(2, i % 7168 // 3584) - ax2_1 = T.axis.spatial(112, (po * 28 + i // 7168) // 14 * 14 + i % 3584 // 256) - ax3_1 = T.axis.spatial(112, (po * 28 + i // 7168) % 14 * 8 + i % 256 // 32) - ax4 = T.axis.spatial(32, i % 32) - T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) - T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) - T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[ + ax1_1 = T.axis.spatial(2, index_0 % 7168 // 3584) + ax2_1 = T.axis.spatial( + 112, (p_outer * 28 + index_0 // 7168) // 14 * 14 + index_0 % 3584 // 256 + ) + ax3_1 = T.axis.spatial( + 112, (p_outer * 28 + index_0 // 7168) % 14 * 8 + index_0 % 256 // 32 + ) + ax4 = T.axis.spatial(32, index_0 % 32) + T.reads(conv2d_nchwc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) + T.writes(t_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) + t_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_nchwc_int8[ ax0_1, ax1_1, ax2_1, ax3_1, ax4 ] @@ -586,122 +628,138 @@ def main( # from tvm.script import tir as T @tvm.script.ir_module class ModuleBase: + """Base module test class.""" + + # pylint: disable=no-self-argument @T.prim_func def main( - p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], - p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], - T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"], + p0_buffer: T.Buffer[(1, 1, 230, 230, 4), "uint8"], + p1_buffer: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], + t_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"], ) -> None: + # pylint: disable=missing-function-docstring # function attr dict T.func_attr({"tir.noalias": True, "global_symbol": "main"}) # buffer definition # body # with T.block("root") - conv2d_NCHWc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32") + conv2d_nchwc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32") for i0_0_i1_0_i2_0_i3_0_fused in T.parallel( 112, annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1} ): - for i4_0_0 in T.serial(1): + for i4_0_0 in 
T.serial(1): # pylint: disable=unused-variable for i1_1_init, i2_1_init, i3_1_init, i1_2_init, i2_2_init, i3_2_init in T.grid( 2, 1, 1, 1, 14, 8 ): with T.block("conv2d_NCHWc_int8_o_init"): n = T.axis.spatial(1, 0) oc_chunk = T.axis.spatial(2, i1_1_init + i1_2_init) - oh = T.axis.spatial( + o_height = T.axis.spatial( 112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1_init * 14 + i2_2_init ) - ow = T.axis.spatial( + o_width = T.axis.spatial( 112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1_init * 8 + i3_2_init ) - oc_block_o = T.axis.spatial(1, 0) + oc_block_o = T.axis.spatial(1, 0) # pylint: disable=unused-variable T.reads() - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32]) + T.writes(conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32]) for i4_1 in T.vectorized(32): with T.block("conv2d_NCHWc_int8_init"): oc_block_i_init = T.axis.spatial(32, i4_1) T.reads() - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init]) - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0 - for i5_0, i6_0, i7_0, i8_0, i9_0_0 in T.grid(1, 1, 1, 1, 1): + T.writes( + conv2d_nchwc_int8[ + n, oc_chunk, o_height, o_width, oc_block_i_init + ] + ) + conv2d_nchwc_int8[ + n, oc_chunk, o_height, o_width, oc_block_i_init + ] = 0 + for i5_0, i6_0, i7_0, i8_0, i9_0_0 in T.grid( # pylint: disable=unused-variable + 1, 1, 1, 1, 1 + ): # pylint: disable=unused-variable for ( - i0_1, + i0_1, # pylint: disable=unused-variable i1_1, i2_1, i3_1, - i4_0_1, + i4_0_1, # pylint: disable=unused-variable i5_1, i6_1, - i7_1, - i8_1, - i9_0_1, - i0_2, + i7_1, # pylint: disable=unused-variable + i8_1, # pylint: disable=unused-variable + i9_0_1, # pylint: disable=unused-variable + i0_2, # pylint: disable=unused-variable i1_2, i2_2, i3_2, - i4_0_2, + i4_0_2, # pylint: disable=unused-variable ) in T.grid(1, 2, 1, 1, 1, 7, 7, 1, 1, 1, 1, 1, 14, 8, 1): with T.block("conv2d_NCHWc_int8_o_update"): n = T.axis.spatial(1, 0) oc_chunk = T.axis.spatial(2, i1_1 + i1_2) - oh = T.axis.spatial( + o_height = T.axis.spatial( 112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1 * 14 + i2_2 ) - ow = T.axis.spatial( + o_width = T.axis.spatial( 112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1 * 8 + i3_2 ) - oc_block_o = T.axis.spatial(1, 0) - kh = T.axis.reduce(7, i5_0 * 7 + i5_1) - kw = T.axis.reduce(7, i6_0 * 7 + i6_1) + oc_block_o = T.axis.spatial(1, 0) # pylint: disable=unused-variable + k_height = T.axis.reduce(7, i5_0 * 7 + i5_1) + k_width = T.axis.reduce(7, i6_0 * 7 + i6_1) ic_outer = T.axis.reduce(1, 0) ic_f_inner = T.axis.reduce(1, 0) - ic_s_inner_o = T.axis.reduce(1, 0) + ic_s_inner_o = T.axis.reduce(1, 0) # pylint: disable=unused-variable T.reads( - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32], - p0[ + conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32], + p0_buffer[ n, ic_outer, - oh * 2 + kh, - ow * 2 + kw, + o_height * 2 + k_height, + o_width * 2 + k_width, ic_f_inner * 4 : ic_f_inner * 4 + 4, ], - p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4], + p1_buffer[ + oc_chunk, ic_outer, k_height, k_width, ic_f_inner, 0:32, 0:4 + ], ) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32]) - A = T.match_buffer( - p0[ + T.writes(conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32]) + a_buffer = T.match_buffer( + p0_buffer[ n, ic_outer, - oh * 2 + kh, - ow * 2 + kw, + o_height * 2 + k_height, + o_width * 2 + k_width, ic_f_inner * 4 : ic_f_inner * 4 + 4, ], [4], dtype="uint8", offset_factor=1, ) - B = T.match_buffer( - p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4], + b_buffer = T.match_buffer( + p1_buffer[ + oc_chunk, 
ic_outer, k_height, k_width, ic_f_inner, 0:32, 0:4 + ], [32, 4], dtype="int8", offset_factor=1, ) - C = T.match_buffer( - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32], + c_buffer = T.match_buffer( + conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32], [32], dtype="int32", offset_factor=1, ) - A_u8x4: T.uint8x4 = A[0:4] - A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") - B_i8x128 = B[0, 0:128] - B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32") - C[0:32] = T.call_llvm_pure_intrin( + a_u8x4: T.uint8x4 = a_buffer[0:4] + a_i32: T.int32 = T.reinterpret(a_u8x4, dtype="int32") + b_i8x128 = b_buffer[0, 0:128] + b_i32x32: T.int32x32 = T.reinterpret(b_i8x128, dtype="int32x32") + c_buffer[0:32] = T.call_llvm_pure_intrin( 4217, T.uint32(3), - C[0:32], - T.broadcast(A_i32, 32), - B_i32x32, + c_buffer[0:32], + T.broadcast(a_i32, 32), + b_i32x32, dtype="int32x32", ) for ax0, ax1, ax2, ax3 in T.grid(1, 2, 14, 8): @@ -715,44 +773,50 @@ def main( 112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + ax3 ) ax4 = T.axis.spatial(32, ax4_fused) - T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) - T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) - T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[ + T.reads(conv2d_nchwc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) + T.writes(t_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) + t_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_nchwc_int8[ ax0_1, ax1_1, ax2_1, ax3_1, ax4 ] @tvm.testing.requires_hexagon def test_meta(hexagon_session): + """Test meta.""" if tvm.testing.utils.IS_IN_CI: pytest.skip("Skipping test since it takes too long in CI.") - a = default_rng().integers(1, 8, (1, 1, 230, 230, 4), dtype="uint8") - w = default_rng().integers(1, 8, (2, 1, 7, 7, 1, 32, 4), dtype="int8") - c = np.zeros((1, 2, 112, 112, 32), dtype="int32") + a_data = default_rng().integers(1, 8, (1, 1, 230, 230, 4), dtype="uint8") + w_data = default_rng().integers(1, 8, (2, 1, 7, 7, 1, 32, 4), dtype="int8") + c_data = np.zeros((1, 2, 112, 112, 32), dtype="int32") sch = tvm.tir.Schedule(ModuleBase) - base_runtime = evaluate(hexagon_session, sch, a, w, c) + base_runtime = evaluate(hexagon_session, sch, a_data, w_data, c_data) sch = tvm.tir.Schedule(ModulePipelined) compute_block = sch.get_block("conv2d_NCHWc_int8_o_update") - o = sch.get_loops(compute_block)[0] + outer = sch.get_loops(compute_block)[0] - unscheduled_vtcm_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1) + unscheduled_vtcm_runtime = evaluate( + hexagon_session, sch, a_data, w_data, c_data, use_async_copy=1 + ) sch = tvm.tir.Schedule(ModulePipelined) compute_block = sch.get_block("conv2d_NCHWc_int8_o_update") - o = sch.get_loops(compute_block)[0] + outer = sch.get_loops(compute_block)[0] - sch.annotate(o, "software_pipeline_stage", [0, 1, 2]) - sch.annotate(o, "software_pipeline_order", [0, 1, 2]) - sch.annotate(o, "software_pipeline_async_stages", [0, 2]) + sch.annotate(outer, "software_pipeline_stage", [0, 1, 2]) + sch.annotate(outer, "software_pipeline_order", [0, 1, 2]) + sch.annotate(outer, "software_pipeline_async_stages", [0, 2]) - pipeline_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1) + pipeline_runtime = evaluate(hexagon_session, sch, a_data, w_data, c_data, use_async_copy=1) - transfer_mb = round((a.size + w.size + c.size) / 1e6, 2) + transfer_mb = round((a_data.size + w_data.size + c_data.size) / 1e6, 2) print_results( - f"Test with A.size: {a.size}, W.size: {w.size}, and total memory transfer of {transfer_mb} MB...", + ( + f"Test with a_buffer.size: 
{a_data.size}, w_buffer.size: {w_data.size}" + f", and total memory transfer of {transfer_mb} MB..." + ), { "without_vtcm": base_runtime, "unscheduled_vtcm_runtime": unscheduled_vtcm_runtime, diff --git a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py index 7e8a6d79f492..24d1a3f788cf 100644 --- a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py +++ b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py @@ -28,7 +28,7 @@ - Testing parameters (input shapes, dtypes, etc.) currently support only one value for each parameter. - - H, W, C must be integer multiples of 8, 8, and 32, + - height, width, channel must be integer multiples of 8, 8, and 32, respectively. I.e., partial blocks aren't currently supported by this script. @@ -42,23 +42,25 @@ primfuncs and demonstrate more coding strategies. """ -import pytest -import numpy as np +from typing import List import copy import os +import pytest +import numpy as np + import tvm.testing from tvm import te, topi, tir from tvm.topi import testing from tvm.contrib.hexagon.session import Session -from typing import List + from .infrastructure import allocate_hexagon_array, get_hexagon_target from . import benchmark_util as bu # Pytest seems to require that fixture names exist in the current module. # E.g., it doesn't allow: @pytest.mark.usefixtures("bu.benchmark_group") -benchmark_group = bu.benchmark_group +BENCHMARK_GROUP = bu.benchmark_group _SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_benchmarks_flag_and_reason() @@ -67,25 +69,25 @@ def _ceil_div(numerator, denominator): return (numerator + (denominator - 1)) // denominator -def _int8_nhwc_8h8w32c_map(n, h, w, c): +def _int8_nhwc_8h8w32c_map(n_batch, height, width, channel): return [ - n, - h // 8, - w // 8, - c // 32, + n_batch, + height // 8, + width // 8, + channel // 32, te.AXIS_SEPARATOR, - h % 8, - w % 8, - c % 32, + height % 8, + width % 8, + channel % 32, ] -def _int8_nhwc_8h8w32c_shape(n, h, w, c) -> List[int]: +def _int8_nhwc_8h8w32c_shape(n_batch, height, width, channel) -> List[int]: return [ - n, - _ceil_div(h, 8), - _ceil_div(w, 8), - _ceil_div(c, 32), + n_batch, + _ceil_div(height, 8), + _ceil_div(width, 8), + _ceil_div(channel, 32), 8, 8, 32, @@ -100,10 +102,10 @@ def _int8_nhwc_8h8w32c_xform_immediate(arr_in: np.ndarray) -> np.ndarray: stage1 = copy.copy(arr_in) ( - n, - h, - w, - c, + n_batch, + height, + width, + channel, ) = stage1.shape ( @@ -112,9 +114,9 @@ def _int8_nhwc_8h8w32c_xform_immediate(arr_in: np.ndarray) -> np.ndarray: c_minor, ) = [8, 8, 32] - h_major = _ceil_div(h, h_minor) - w_major = _ceil_div(w, w_minor) - c_major = _ceil_div(c, c_minor) + h_major = _ceil_div(height, h_minor) + w_major = _ceil_div(width, w_minor) + c_major = _ceil_div(channel, c_minor) # This handles cases where the dimensions of arr_in are not cleanly divided # by the minor block size, i.e. [8, 8, 32]. @@ -122,10 +124,12 @@ def _int8_nhwc_8h8w32c_xform_immediate(arr_in: np.ndarray) -> np.ndarray: # Any additional array elements that this creates will ahve value 0. # We shouldn't actually care what value is used for those elements, because they # shouldn't be treated as meaningful by any of our algorithms. 
- if (h % h_minor) or (w % w_minor) or (c % c_minor): - stage1.resize((n, h_major * h_minor, w_major * w_minor, c_major * c_minor), refcheck=False) + if (height % h_minor) or (width % w_minor) or (channel % c_minor): + stage1.resize( + (n_batch, h_major * h_minor, w_major * w_minor, c_major * c_minor), refcheck=False + ) - stage2 = stage1.reshape(n, h_major, h_minor, w_major, w_minor, c_major, c_minor) + stage2 = stage1.reshape(n_batch, h_major, h_minor, w_major, w_minor, c_major, c_minor) stage3 = stage2.transpose(0, 1, 3, 5, 2, 4, 6) return stage3 @@ -137,8 +141,10 @@ def _create_test_input(shape, dtype: str) -> np.ndarray: return np.random.randint(low=min_value, high=max_value, size=tuple(shape), dtype=np.int8) -@pytest.mark.usefixtures("benchmark_group") +@pytest.mark.usefixtures("BENCHMARK_GROUP") class TestMaxPool2D: + """maxpool2D base test class""" + csv_column_order = [ # Identifies which TE-compute / TIRScript is used as the basis for the # benchmarked primfunc. Only needs to be meaningful to humans. @@ -150,12 +156,12 @@ class TestMaxPool2D: # Values directly based on test parameters... "input_shape_4d", "block_shape", - "DTYPE", - "KERNEL", - "STRIDE", - "DILATION", - "PADDING", - "IO_TENSOR_MEM_SCOPE", + "dtype", + "kernel", + "stride", + "dilation", + "padding", + "io_tensor_mem_scope", # Reserved columns defined by the BenchmarksTable class. "row_status", "timings_min_usecs", @@ -170,48 +176,50 @@ class TestMaxPool2D: "comments", ] - DTYPE = tvm.testing.parameter("int8") + dtype = tvm.testing.parameter("int8") - # FIXME(cconvey): The script currently fails when H, W, or C is not an + # FIXME(cconvey): The script currently fails when height, width, or channel is not an # integer multiple of 8, 8, or 32, respectively. - N = tvm.testing.parameter(1) - H = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]]) - W = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]]) - C = tvm.testing.parameter(*[x * 32 for x in [1, 2]]) + n_batch = tvm.testing.parameter(1) + height = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]]) + width = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]]) + channel = tvm.testing.parameter(*[x * 32 for x in [1, 2]]) - KERNEL = tvm.testing.parameter((1, 1), (3, 3)) - STRIDE = tvm.testing.parameter((1, 1)) - DILATION = tvm.testing.parameter((1, 1)) - PADDING = tvm.testing.parameter((0, 0, 0, 0)) - IO_TENSOR_MEM_SCOPE = tvm.testing.parameter("global.vtcm") + kernel = tvm.testing.parameter((1, 1), (3, 3)) + stride = tvm.testing.parameter((1, 1)) + dilation = tvm.testing.parameter((1, 1)) + padding = tvm.testing.parameter((0, 0, 0, 0)) + io_tensor_mem_scope = tvm.testing.parameter("global.vtcm") @pytest.mark.skipif(_SHOULD_SKIP_BENCHMARKS, reason=_SKIP_BENCHMARKS_REASON) @tvm.testing.requires_hexagon def test_maxpool2d_nhwc( self, - N, - H, - W, - C, - DTYPE, - KERNEL, - STRIDE, - DILATION, - PADDING, - IO_TENSOR_MEM_SCOPE, + n_batch, + height, + width, + channel, + dtype, + kernel, + stride, + dilation, + padding, + io_tensor_mem_scope, hexagon_session: Session, ): + """Test maxpool2d NHWC""" + keys_dict = { "basic_kernel": "max_pool2d", "sched_type": 1, - "input_shape_4d": [N, H, W, C], + "input_shape_4d": [n_batch, height, width, channel], "block_shape": [8, 8, 32], - "DTYPE": DTYPE, - "KERNEL": KERNEL, - "STRIDE": STRIDE, - "DILATION": DILATION, - "PADDING": PADDING, - "IO_TENSOR_MEM_SCOPE": IO_TENSOR_MEM_SCOPE, + "dtype": dtype, + "kernel": kernel, + "stride": stride, + "dilation": dilation, + "padding": padding, + "io_tensor_mem_scope": io_tensor_mem_scope, 
} desc = bu.get_benchmark_decription(keys_dict) @@ -229,13 +237,13 @@ def test_maxpool2d_nhwc( log_file.write(f"CONFIGURATION: {desc}\n") try: - input_tensor_shape_4d = [N, H, W, C] - input_tensor_shape_7d = _int8_nhwc_8h8w32c_shape(N, H, W, C) + input_tensor_shape_4d = [n_batch, height, width, channel] + input_tensor_shape_7d = _int8_nhwc_8h8w32c_shape(n_batch, height, width, channel) - data = te.placeholder(tuple(input_tensor_shape_4d), dtype=DTYPE) + data = te.placeholder(tuple(input_tensor_shape_4d), dtype=dtype) output = topi.nn.pool2d( - data, KERNEL, STRIDE, DILATION, PADDING, "max", layout="NHWC" + data, kernel, stride, dilation, padding, "max", layout="NHWC" ) primfunc = te.create_prim_func([data, output]) @@ -262,20 +270,21 @@ def test_maxpool2d_nhwc( # Note that we'll eventually need it in two different layouts: # (1) NHWC as an argument to testing.poolnd_python. # (2) NHWC_8h8w32c for as an argument to our Hexagon primfunc. - # a_numpy_4d = np.random.randint(low=-128, high=127, size=input_tensor_shape_4d, dtype=np.int8) - a_numpy_4d = _create_test_input(input_tensor_shape_4d, DTYPE) + # a_numpy_4d = np.random.randint(low=-128, high=127, + # size=input_tensor_shape_4d, dtype=np.int8) + a_numpy_4d = _create_test_input(input_tensor_shape_4d, dtype) ref_output_4d = testing.poolnd_python( a_numpy_4d.astype("int32"), - KERNEL, - STRIDE, - DILATION, - PADDING[0:2], - PADDING[2:], + kernel, + stride, + dilation, + padding[0:2], + padding[2:], pool_type="max", dtype="int32", layout="NHWC", - ).astype(DTYPE) + ).astype(dtype) output_tensor_shape_4d = ref_output_4d.shape @@ -285,28 +294,25 @@ def test_maxpool2d_nhwc( hexagon_session.device, tensor_shape=input_tensor_shape_7d, axis_separators=[4], - dtype=DTYPE, - mem_scope=IO_TENSOR_MEM_SCOPE, + dtype=dtype, + mem_scope=io_tensor_mem_scope, ) c_hexagon_4d = allocate_hexagon_array( hexagon_session.device, tensor_shape=output_tensor_shape_4d, axis_separators=[], - dtype=DTYPE, - mem_scope=IO_TENSOR_MEM_SCOPE, + dtype=dtype, + mem_scope=io_tensor_mem_scope, ) a_hexagon_7d.copyfrom(a_numpy_7d) - if DTYPE == "int8": + if dtype == "int8": rel_tolerance = 0 abs_tolerance = 0 else: - assert False, f"TODO: decide acceptable tolerances for DTYPE {DTYPE}" - - # hexagon_mod(a_hexagon_7d, c_hexagon_4d) - # tvm.testing.assert_allclose(ref_output_4d, c_hexagon_4d.numpy(), rtol=rel_tolerance, atol=abs_tolerance) + assert False, f"TODO: decide acceptable tolerances for dtype {dtype}" timer = hexagon_mod.time_evaluator( "main", hexagon_session.device, number=10, repeat=1 @@ -317,29 +323,29 @@ def test_maxpool2d_nhwc( tvm.testing.assert_allclose( ref_output_4d, c_hexagon_4d.numpy(), rtol=rel_tolerance, atol=abs_tolerance ) - except AssertionError as e: - raise bu.NumericalAccuracyException(str(e)) + except AssertionError as exception: + raise bu.NumericalAccuracyException(str(exception)) - except bu.NumericalAccuracyException as e: + except bu.NumericalAccuracyException as exception: print() print(f"FAIL: Numerical accuracy error. See log file.") log_file.write("\n") - log_file.write(f"FAIL: {e}\n") + log_file.write(f"FAIL: {exception}\n") self.benchmark_table.record_fail( **keys_dict, comments=f"Numerical accuracy error. See log file." 
)

-        except bu.UnsupportedException as e:
+        except bu.UnsupportedException as exception:
             print()
-            print(f"SKIP: {e}")
+            print(f"SKIP: {exception}")

             log_file.write("\n")
-            log_file.write(f"SKIP: {e}\n")
+            log_file.write(f"SKIP: {exception}\n")

             self.benchmark_table.record_skip(
-                **keys_dict, comments=f"Unsupported configuration: {e}"
+                **keys_dict, comments=f"Unsupported configuration: {exception}"
             )

         self.benchmark_table.record_success(timing_result, **keys_dict)
diff --git a/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py b/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py
index 5ec46cf4ae70..40edbda550b7 100644
--- a/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py
+++ b/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py
@@ -15,11 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.

-import math
-import struct
-import numpy as np
-import tvm.topi.hexagon.utils as utils
-
 """
 Test float to fixed-point conversion. We do it by constructing a numpy array with the
 wide range of floating-point values. These values are converted into the
@@ -29,9 +24,17 @@
 raised if they happened to be outside of the expected tolerance.
 """

+import math
+import struct
+import numpy as np
+import tvm.topi.hexagon.utils as utils
+

 class TestFixedPointConversion:
+    """Fixed point conversion test class"""
+
     def test_fixed_point_conversion(self):
+        """Test fixed point conversion"""
         # Construct array with wide range of values
         fp1 = np.random.uniform(0.00001, 0.0002, size=(10))
         fp2 = np.random.uniform(0.001, 0.02, size=(10))
diff --git a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
index 43feb827af42..5eac35f2d683 100644
--- a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
+++ b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
@@ -14,6 +14,8 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""Test Fixed Point Multiply on Hexagon."""
+
 import re

 import numpy as np
@@ -80,91 +82,92 @@ def run_module(mod, inputs):
     return output


-in_scale_const, out_scale_const = tvm.testing.parameters(
-    (1.3, 30.0),
-    (1.37, 1.0),
-    (0.6, 1.0),
-    ((1.7, 0.6), 1.0),
-    ((0.007, 1.9), 1.0),
-)
-
-multiplier, shift = tvm.testing.parameters(
-    (1288490240, -2),  # 0.15
-    (1395864320, 1),  # 1.3
-    (1288490188, 0),  # 0.6
-)
-
-
-@tvm.testing.requires_hexagon
-def test_fixed_point_multiply(hexagon_session: Session, multiplier: int, shift: int):
-    ishape = (6, 32)
-    a = relay.var("a", relay.TensorType(ishape, "int32"))
-    fpm = relay.fixed_point_multiply(a, multiplier, shift)
-    relay_mod = tvm.IRModule.from_expr(fpm)
-
-    with tvm.transform.PassContext(opt_level=3):
-        # Compile for Hexagon...
-        hexagon_lowered = build_module(relay_mod, HEXAGON_AOT_LLVM_TARGET)
-
-        # Compile for LLVM...
-        llvm_lowered = build_module(relay_mod, tvm.target.Target("llvm"))
-
-    data_in = np.arange(-96, 96).reshape(ishape)
-    inputs = {"a": data_in}
-
-    # Run hexagon...
-    hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
-    hexagon_output = run_module(hexagon_mod, inputs)
+class TestFixedPointMultiply:
+    """Fixed point Multiply test class"""

-    # Run llvm...
- llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0))) - expected_output = run_module(llvm_mod, inputs) - - tvm.testing.assert_allclose(hexagon_output, expected_output) - - -@tvm.testing.requires_hexagon -def test_per_channel_fixed_point_multiply( - hexagon_session: Session, in_scale_const, out_scale_const -): - ishape = [1, 128, 56, 56] - axis = 1 - a = relay.var("a", shape=ishape, dtype="int32") - - # Make list of input scales from in_scale_const parameter. - if isinstance(in_scale_const, tuple): - in_scale = list(in_scale_const) * (ishape[axis] // len(in_scale_const)) - else: - in_scale = [in_scale_const] * ishape[axis] - assert len(in_scale) == ishape[axis] - - # qnn.requantize is lowered to fixed_point_multiply if zp == 0 and in_dtype == out_dtype. - iscale = relay.const(in_scale) - izero = relay.const(0) - oscale = relay.const(out_scale_const) - ozero = relay.const(0) - op = relay.qnn.op.requantize(a, iscale, izero, oscale, ozero, axis=axis, out_dtype="int32") - mod = tvm.IRModule.from_expr(op) - - with tvm.transform.PassContext(opt_level=3): - # Compile for Hexagon... - hexagon_lowered = build_module(mod, HEXAGON_AOT_LLVM_TARGET) - - # Compile for LLVM... - llvm_lowered = build_module(mod, tvm.target.Target("llvm")) - - a_np = np.random.randint(-1000, 1000, size=np.prod(ishape)).reshape(ishape) - inputs = {"a": a_np} - - # Run hexagon... - hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = run_module(hexagon_mod, inputs) + in_scale_const, out_scale_const = tvm.testing.parameters( + (1.3, 30.0), + (1.37, 1.0), + (0.6, 1.0), + ((1.7, 0.6), 1.0), + ((0.007, 1.9), 1.0), + ) - # Run llvm... - llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0))) - expected_output = run_module(llvm_mod, inputs) + multiplier, shift = tvm.testing.parameters( + (1288490240, -2), # 0.15 + (1395864320, 1), # 1.3 + (1288490188, 0), # 0.6 + ) - tvm.testing.assert_allclose(hexagon_output, expected_output) + @tvm.testing.requires_hexagon + def test_fixed_point_multiply(self, hexagon_session: Session, multiplier: int, shift: int): + """Fixed point multiply test.""" + ishape = (6, 32) + a = relay.var("a", relay.TensorType(ishape, "int32")) + fpm = relay.fixed_point_multiply(a, multiplier, shift) + relay_mod = tvm.IRModule.from_expr(fpm) + + with tvm.transform.PassContext(opt_level=3): + # Compile for Hexagon... + hexagon_lowered = build_module(relay_mod, HEXAGON_AOT_LLVM_TARGET) + + # Compile for LLVM... + llvm_lowered = build_module(relay_mod, tvm.target.Target("llvm")) + + data_in = np.arange(-96, 96).reshape(ishape) + inputs = {"a": data_in} + + # Run hexagon... + hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered) + hexagon_output = run_module(hexagon_mod, inputs) + + # Run llvm... + llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0))) + expected_output = run_module(llvm_mod, inputs) + + tvm.testing.assert_allclose(hexagon_output, expected_output) + + @tvm.testing.requires_hexagon + def test_per_channel(self, hexagon_session: Session, in_scale_const, out_scale_const): + """Per channel multiply test.""" + ishape = [1, 128, 56, 56] + axis = 1 + a = relay.var("a", shape=ishape, dtype="int32") + + # Make list of input scales from in_scale_const parameter. 
+ if isinstance(in_scale_const, tuple): + in_scale = list(in_scale_const) * (ishape[axis] // len(in_scale_const)) + else: + in_scale = [in_scale_const] * ishape[axis] + assert len(in_scale) == ishape[axis] + + # qnn.requantize is lowered to fixed_point_multiply if zp == 0 and in_dtype == out_dtype. + iscale = relay.const(in_scale) + izero = relay.const(0) + oscale = relay.const(out_scale_const) + ozero = relay.const(0) + op = relay.qnn.op.requantize(a, iscale, izero, oscale, ozero, axis=axis, out_dtype="int32") + mod = tvm.IRModule.from_expr(op) + + with tvm.transform.PassContext(opt_level=3): + # Compile for Hexagon... + hexagon_lowered = build_module(mod, HEXAGON_AOT_LLVM_TARGET) + + # Compile for LLVM... + llvm_lowered = build_module(mod, tvm.target.Target("llvm")) + + a_np = np.random.randint(-1000, 1000, size=np.prod(ishape)).reshape(ishape) + inputs = {"a": a_np} + + # Run hexagon... + hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered) + hexagon_output = run_module(hexagon_mod, inputs) + + # Run llvm... + llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0))) + expected_output = run_module(llvm_mod, inputs) + + tvm.testing.assert_allclose(hexagon_output, expected_output) if __name__ == "__main__": diff --git a/tests/python/contrib/test_hexagon/test_memory_alloc.py b/tests/python/contrib/test_hexagon/test_memory_alloc.py index a6d011eddd5a..f44e3cd0dc36 100644 --- a/tests/python/contrib/test_hexagon/test_memory_alloc.py +++ b/tests/python/contrib/test_hexagon/test_memory_alloc.py @@ -14,8 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -import os.path +"""Test memory allocation.""" import numpy as np @@ -25,23 +24,25 @@ from .infrastructure import allocate_hexagon_array, get_hexagon_target -@tvm.testing.fixture -def generated_func(shape, scope, dtype, axis_separators): +def generated_func(shape: tuple, dtype: str, axis_separators: list): + """Generate element wise function.""" dim0, dim1 = shape @T.prim_func def elwise(a: T.handle, b: T.handle): - A = T.match_buffer(a, shape, dtype=dtype, axis_separators=axis_separators) - B = T.match_buffer(b, shape, dtype=dtype, axis_separators=axis_separators) + a_buffer = T.match_buffer(a, shape, dtype=dtype, axis_separators=axis_separators) + b_buffer = T.match_buffer(b, shape, dtype=dtype, axis_separators=axis_separators) for i, j in T.grid(dim0, dim1): with T.block("compute"): - B[i, j] = A[i, j] * T.cast(2, dtype=dtype) + b_buffer[i, j] = a_buffer[i, j] * T.cast(2, dtype=dtype) return elwise class TestMemoryAlloc: + """Memory allocation test.""" + dtype = tvm.testing.parameter("int8") shape = tvm.testing.parameter((128, 128)) @@ -53,11 +54,10 @@ class TestMemoryAlloc: ("global.ddr", [1]), ) - def test_global_axis_separator( - self, hexagon_session, generated_func, shape, dtype, scope, axis_separators - ): + def test_global_axis_separator(self, hexagon_session, shape, dtype, scope, axis_separators): + """Test with global axis separator.""" mod1 = tvm.build( - generated_func, + generated_func(shape, dtype, axis_separators), target=get_hexagon_target("v69"), ) mod2 = hexagon_session.load_module(mod1) diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py index a7f4cbc39cb1..9f4fd0768c50 100644 --- a/tests/python/contrib/test_hexagon/test_meta_schedule.py +++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py @@ -44,24 +44,31 
@@ @tvm.script.ir_module class MatmulModule: + """Matmultest class""" + @T.prim_func def main( # type: ignore # pylint: disable=no-self-argument a: T.handle, b: T.handle, c: T.handle ) -> None: + """main function""" T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, (16, 16), "float32") - B = T.match_buffer(b, (16, 16), "float32") - C = T.match_buffer(c, (16, 16), "float32") + a_buffer = T.match_buffer(a, (16, 16), "float32") + b_buffer = T.match_buffer(b, (16, 16), "float32") + c_buffer = T.match_buffer(c, (16, 16), "float32") for i, j, k in T.grid(16, 16, 16): with T.block("matmul"): - vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + vi_axis, vj_axis, vk_axis = T.axis.remap("SSR", [i, j, k]) with T.init(): - C[vi, vj] = 0.0 # type: ignore - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + c_buffer[vi_axis, vj_axis] = 0.0 # type: ignore + c_buffer[vi_axis, vj_axis] = ( + c_buffer[vi_axis, vj_axis] + + a_buffer[vi_axis, vk_axis] * b_buffer[vk_axis, vj_axis] + ) @tvm.testing.requires_hexagon def test_builder_runner(hexagon_launcher): + """Test builder and runner.""" if hexagon_launcher.is_simulator(): pytest.skip(msg="Tuning on simulator not supported.") @@ -96,33 +103,35 @@ def test_builder_runner(hexagon_launcher): assert result >= 0.0 -def dense(m, n, k): +def dense_compute(m, n, k): + """dense compute""" X = te.placeholder((m, k), name="X", dtype="uint8") - packedW = te.placeholder((n // 32, k // 4, 32, 4), name="packedW", dtype="uint8") + packed_width = te.placeholder((n // 32, k // 4, 32, 4), name="packed_width", dtype="uint8") - ak = te.reduce_axis((0, k), name="k") + axis_k = te.reduce_axis((0, k), name="k") out = te.compute( (m, n), lambda i, j: te.sum( - X[i, ak].astype("int32") - * packedW[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j % 32, ak % 4].astype( - "int32" - ), - axis=ak, + X[i, axis_k].astype("int32") + * packed_width[ + tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(axis_k, 4), j % 32, axis_k % 4 + ].astype("int32"), + axis=axis_k, ), name="compute", ) - return [X, packedW, out] + return [X, packed_width, out] -def schedule_dense(sch, block, M, do_tune): +def schedule_dense(sch, block, m_size, do_tune): + """dense schedule""" a_y, a_x, _ = sch.get_loops(block)[-3:] if do_tune: y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128) a_yo, a_yi = sch.split(a_y, factors=y_factors) else: - a_yo, a_yi = sch.split(a_y, factors=[None, min(M, 32)]) + a_yo, a_yi = sch.split(a_y, factors=[None, min(m_size, 32)]) a_xo, a_xi = sch.split(a_x, factors=[None, 32]) sch.reorder(a_yo, a_xo, a_yi, a_xi) @@ -143,51 +152,55 @@ def schedule_dense(sch, block, M, do_tune): sch.tensorize(a_xi, VRMPY_u8u8i32_INTRIN) -def verify_dense(sch, target, M, N, K, hexagon_session): +def verify_dense(sch, target, m_size, n_size, k_size, hexagon_session): + """Verify dense operator.""" f = tvm.build(sch.mod["main"], target=target, name="dense") mod = hexagon_session.load_module(f) dev = hexagon_session.device - a_np = np.random.uniform(1, 10, size=(M, K)).astype("uint8") - b_np = np.random.uniform(1, 10, size=(N, K)).astype("uint8") + a_np = np.random.uniform(1, 10, size=(m_size, k_size)).astype("uint8") + b_np = np.random.uniform(1, 10, size=(n_size, k_size)).astype("uint8") c_np = np.dot(a_np.astype("int32"), b_np.transpose().astype("int32")) - packW = np.random.uniform(1, 10, size=(N // 32, (K // 4), 32, 4)).astype("uint8") + pack_width = np.random.uniform(1, 10, size=(n_size // 32, (k_size // 4), 32, 4)).astype("uint8") - for r_idx in range(N // 32): - 
for ko in range(K // 4):
+    for r_idx in range(n_size // 32):
+        for k_output in range(k_size // 4):
             for s_idx in range(32):
                 for t_idx in range(4):
-                    packW[r_idx][ko][s_idx][t_idx] = b_np[r_idx * 32 + s_idx][ko * 4 + t_idx]
+                    pack_width[r_idx][k_output][s_idx][t_idx] = b_np[r_idx * 32 + s_idx][
+                        k_output * 4 + t_idx
+                    ]
 
     a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(packW, dev)
-    c = tvm.nd.array(np.zeros((M, N), dtype="int32"), dev)
+    b = tvm.nd.array(pack_width, dev)
+    c = tvm.nd.array(np.zeros((m_size, n_size), dtype="int32"), dev)
 
     mod(a, b, c)
 
     np.testing.assert_equal(c.numpy(), c_np)
 
     evaluator = mod.time_evaluator(mod.entry_name, dev, number=10)
-    gflops = (N * M * K) * 2 / 1e9
+    gflops = (n_size * m_size * k_size) * 2 / 1e9
     time_ms = evaluator(a, b, c).mean * 1e3
     print("%f ms, %f GOPS" % (time_ms, gflops / (time_ms / 1e3)))
 
 
 @tvm.testing.requires_hexagon
 def test_vrmpy_dense(hexagon_launcher):
+    """Test vector reduce multiply dense."""
     if hexagon_launcher.is_simulator():
         pytest.skip(msg="Tuning on simulator not supported.")
 
     do_tune = True
 
-    M, N, K = 128, 768, 768
-    workload = te.create_prim_func(dense(M, N, K))
+    m_size, n_size, k_size = 128, 768, 768
+    workload = te.create_prim_func(dense_compute(m_size, n_size, k_size))
 
     if not do_tune:
         ir_module = tvm.IRModule({"main": workload})
         sch = tvm.tir.Schedule(ir_module)
         block = sch.get_block("compute")
-        schedule_dense(sch, block, M, do_tune)
+        schedule_dense(sch, block, m_size, do_tune)
     else:
         with tempfile.TemporaryDirectory() as work_dir:

@@ -214,19 +227,23 @@ def schedule_dense_for_tune(sch):
         sch = ms.tir_integration.compile_tir(database, workload, target)
 
     with hexagon_launcher.create_session() as session:
-        verify_dense(sch, get_hexagon_target("v68"), M, N, K, session)
+        verify_dense(sch, get_hexagon_target("v68"), m_size, n_size, k_size, session)
 
 
 # This is an example of a schedule found by vrmpy auto tensorization.
 # It gets 440 GFLOPS on SD888. 
@tvm.script.ir_module
-class Module_vrmpy_auto_tensorize:
+class ModuleVRMPYAutoTensorize:
+    """Vector Reduce Multiply auto tensorize test class."""
+
+    # pylint: disable=no-self-argument
     @T.prim_func
     def main(  # type: ignore
         X: T.Buffer[(128, 768), "uint8"],  # type: ignore
-        packedW: T.Buffer[(24, 192, 32, 4), "uint8"],  # type: ignore
+        packed_width: T.Buffer[(24, 192, 32, 4), "uint8"],  # type: ignore
         compute: T.Buffer[(128, 768), "int32"],  # type: ignore
     ) -> None:
+        """Main function."""
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
         for i0_0_i1_0_0_fused in T.parallel(
             512, annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}
@@ -251,33 +268,42 @@ def main(  # type: ignore
                         T.reads(
                             compute[i, j_o * 32 : j_o * 32 + 32],  # type: ignore
                             X[i, k_o * 4 : k_o * 4 + 4],  # type: ignore
-                            packedW[j_o, k_o, 0:32, 0:4],  # type: ignore
+                            packed_width[j_o, k_o, 0:32, 0:4],  # type: ignore
                         )
                         T.writes(compute[i, j_o * 32 : j_o * 32 + 32])  # type: ignore
-                        A = T.match_buffer(
-                            X[i, k_o * 4 : k_o * 4 + 4], [4], dtype="uint8", offset_factor=1  # type: ignore
+                        a_buffer = T.match_buffer(
+                            X[i, k_o * 4 : k_o * 4 + 4],
+                            [4],
+                            dtype="uint8",
+                            offset_factor=1,  # type: ignore
                         )
-                        B = T.match_buffer(
-                            packedW[j_o, k_o, 0:32, 0:4], [32, 4], dtype="uint8", offset_factor=1
+                        b_buffer = T.match_buffer(
+                            packed_width[j_o, k_o, 0:32, 0:4], [32, 4], dtype="uint8", offset_factor=1
                         )
-                        C = T.match_buffer(
-                            compute[i, j_o * 32 : j_o * 32 + 32], [32], dtype="int32", offset_factor=1  # type: ignore
+                        c_buffer = T.match_buffer(
+                            compute[i, j_o * 32 : j_o * 32 + 32],
+                            [32],
+                            dtype="int32",
+                            offset_factor=1,  # type: ignore
                         )
-                        A_u8x4: T.uint8x4 = A[0:4]  # type: ignore
-                        A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")  # type: ignore
-                        B_i32x32: T.int32x32 = T.reinterpret(B[0, 0:128], dtype="int32x32")  # type: ignore
-                        C[0:32] = T.call_llvm_pure_intrin(  # type: ignore
-                            4390, T.uint32(3), C[0:32], B_i32x32, A_i32, dtype="int32x32"
+                        a_u8x4: T.uint8x4 = a_buffer[0:4]  # type: ignore
+                        a_i32: T.int32 = T.reinterpret(a_u8x4, dtype="int32")  # type: ignore
+                        b_i32x32: T.int32x32 = T.reinterpret(
+                            b_buffer[0, 0:128], dtype="int32x32"
+                        )  # type: ignore
+                        c_buffer[0:32] = T.call_llvm_pure_intrin(  # type: ignore
+                            4390, T.uint32(3), C[0:32], b_i32x32, a_i32, dtype="int32x32"
                         )
 
 
 @tvm.testing.requires_hexagon
 def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
+    """Test VRMPY dense operator."""
     if hexagon_launcher.is_simulator():
         pytest.skip(msg="Tuning on simulator not supported.")
 
-    M, N, K = 128, 768, 768
-    workload = te.create_prim_func(dense(M, N, K))
+    m_size, n_size, k_size = 128, 768, 768
+    workload = te.create_prim_func(dense_compute(m_size, n_size, k_size))
 
     sch_rules = [
         schedule_rule.MultiLevelTilingWithIntrin(
@@ -308,7 +334,8 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
     ]
 
     # Make this to False to compile and run the best tuned schedule
-    if True:
+    run_tuning = True
+    if run_tuning:
         with tempfile.TemporaryDirectory() as work_dir:
             target = get_hexagon_target("v68")
             database = ms.tir_integration.tune_tir(
@@ -328,25 +355,26 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
             )
             sch = ms.tir_integration.compile_tir(database, workload, target)
     else:
-        sch = tvm.tir.Schedule(Module_vrmpy_auto_tensorize, debug_mask="all")
+        sch = tvm.tir.Schedule(ModuleVRMPYAutoTensorize, debug_mask="all")
 
     with hexagon_launcher.create_session() as session:
-        verify_dense(sch, get_hexagon_target("v68"), M, N, K, session)
+        verify_dense(sch, get_hexagon_target("v68"), m_size, 
n_size, k_size, session) @tvm.testing.requires_hexagon def test_conv2d_relay_auto_schedule(hexagon_launcher): + """Test conv2d using auto schedule.""" if hexagon_launcher.is_simulator(): pytest.skip(msg="Tuning on simulator not supported.") - I, O, H, W = 64, 64, 56, 56 - kH = kW = 3 + i_size, o_size, h_size, w_size = 64, 64, 56, 56 + k_height_size = k_width_size = 3 strides = (1, 1) padding = (1, 1) - d_shape = (1, H, W, I) - w_shape = (kH, kW, I, O) + d_shape = (1, h_size, w_size, i_size) + w_shape = (k_height_size, k_width_size, i_size, o_size) bias_shape = (1, 1, 1, w_shape[3]) out_channel = w_shape[3] @@ -356,7 +384,7 @@ def test_conv2d_relay_auto_schedule(hexagon_launcher): conv2d = relay.nn.conv2d( data=data, weight=weight, - kernel_size=(kH, kW), + kernel_size=(k_height_size, k_width_size), channels=out_channel, padding=padding, strides=strides, diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx.py b/tests/python/contrib/test_hexagon/test_parallel_hvx.py index 046f949a761f..15273afdd41e 100644 --- a/tests/python/contrib/test_hexagon/test_parallel_hvx.py +++ b/tests/python/contrib/test_hexagon/test_parallel_hvx.py @@ -26,7 +26,11 @@ from .infrastructure import get_hexagon_target -TEST_OUTPUT_TEMPLATE = "Test {} with {} operations... \n -Single Thread: {} ms \n -Parallel: {} ms\n -Speedup: {}x\n" +TEST_OUTPUT_TEMPLATE = ( + "Test {} with {} operations... \n" + " -Single Thread: {} ms \n" + " -Parallel: {} ms\n -Speedup: {}x\n" +) def get_vrmpy_shape_dtypes(operations): @@ -61,28 +65,30 @@ def vrmpy_expected_producer(shape, a, b): expected = np.zeros(shape, dtype="int32") for n in range(shape[0]): for i in range(32): - for r in range(4): - expected[n, i] = expected[n, i] + np.uint32(a[n, i * 4 + r]) * np.uint32( - b[n, i * 4 + r] + for r_ind in range(4): + expected[n, i] = expected[n, i] + np.uint32(a[n, i * 4 + r_ind]) * np.uint32( + b[n, i * 4 + r_ind] ) return expected def get_vmpy_operator(operations): + """Generate vector multiply operator""" + @T.prim_func def operator(a: T.handle, b: T.handle, c: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, [operations, 128], dtype="uint8") - B = T.match_buffer(b, [operations, 128], dtype="uint8") - C = T.match_buffer(c, [operations, 128], dtype="int16") + a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8") + b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8") + c_buffer = T.match_buffer(c, [operations, 128], dtype="int16") for n in T.grid(operations): - with T.block("C"): - vn = T.axis.remap("S", [n]) - C[vn, T.ramp(0, 1, 128)] = T.call_llvm_intrin( + with T.block("c_buffer"): + vn_ind = T.axis.remap("S", [n]) + c_buffer[vn_ind, T.ramp(0, 1, 128)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vmpybusv.128B"), T.uint32(2), - T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), - T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(a_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(b_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), dtype="int16x128", ) @@ -90,20 +96,22 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: def get_vadd_operator(operations): + """Generate vadd operator.""" + @T.prim_func def operator(a: T.handle, b: T.handle, c: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, [operations, 128], dtype="uint8") - B = T.match_buffer(b, [operations, 128], dtype="uint8") - C = T.match_buffer(c, [operations, 
128], dtype="int16") + a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8") + b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8") + c_buffer = T.match_buffer(c, [operations, 128], dtype="int16") for n in T.grid(operations): - with T.block("C"): - vn = T.axis.remap("S", [n]) - C[vn, T.ramp(0, 1, 128)] = T.call_llvm_intrin( + with T.block("c_buffer"): + vn_ind = T.axis.remap("S", [n]) + c_buffer[vn_ind, T.ramp(0, 1, 128)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vaddubh.128B"), T.uint32(2), - T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), - T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(a_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(b_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), dtype="int16x128", ) @@ -111,20 +119,22 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: def get_vrmpy_operator(operations): + """Generate vrmpy operator.""" + @T.prim_func def operator(a: T.handle, b: T.handle, c: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, [operations, 128], dtype="uint8") - B = T.match_buffer(b, [operations, 128], dtype="uint8") - C = T.match_buffer(c, [operations, 32], dtype="int32") + a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8") + b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8") + c_buffer = T.match_buffer(c, [operations, 32], dtype="int32") for n in T.grid(operations): - with T.block("C"): - vn = T.axis.remap("S", [n]) - C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin( + with T.block("c_buffer"): + vn_ind = T.axis.remap("S", [n]) + c_buffer[vn_ind, T.ramp(0, 1, 32)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), T.uint32(2), - T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), - T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(a_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(b_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), dtype="int32x32", ) @@ -132,6 +142,7 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: def evaluate(hexagon_session, shape_dtypes, expected_output_producer, sch): + """Evaluate schedule.""" a_shape, a_dtype, b_shape, b_dtype, c_shape, c_dtype = shape_dtypes func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v68")) @@ -160,6 +171,7 @@ def evaluate(hexagon_session, shape_dtypes, expected_output_producer, sch): class TestMatMulVec: + """MatMul test class.""" ( operation_name, @@ -182,9 +194,11 @@ class TestMatMulVec: 128, # 256, # 512, - # 1024, # Single thread runs faster since L2 cache can handle the entire request quickly + # Single thread runs faster since L2 cache can handle the entire request quickly + # 1024, # 2048, - # 4096, # Significant performance degredation once the inputs and outputs cannot all fit in L2 + # Significant performance degredation once the inputs and outputs cannot all fit in L2 + # 4096, # 8192, # 16384, ) @@ -200,6 +214,7 @@ def test( expected_output_producer, split_factor, ): + """Test function handler.""" sch = tvm.tir.Schedule(operator_producer(operation_count)) single_thread_runtime = evaluate( @@ -207,10 +222,10 @@ def test( ) sch = tvm.tir.Schedule(operator_producer(operation_count)) - block = sch.get_block("C") + block = sch.get_block("c_buffer") b = sch.get_loops(block) - bo, _ = sch.split(b[0], factors=[split_factor, None]) - sch.parallel(bo) + b_output, _ = sch.split(b[0], factors=[split_factor, 
None]) + sch.parallel(b_output) parallel_runtime = evaluate( hexagon_session, shape_dtypes_producer(operation_count), expected_output_producer, sch diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py index 6e43298a4eb5..f720f67ea354 100644 --- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py +++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py @@ -25,14 +25,24 @@ from .infrastructure import get_hexagon_target -TEST_OUTPUT_TEMPLATE = "Test with {} MB of data to load... \n -No VTCM: {} Gops \n -Basic VTCM: {} Gops \n -Vectorized: {} Gops\n -Vectorized and Parallelized: {} Gops\n -Preallocated and Vectorized: {} Gops\n -Preallocated, Vectorized, and Parallelized: {} Gops\n -Single DMA: {} Gops\n -Preloaded: {} Gops\n" +TEST_OUTPUT_TEMPLATE = ( + "Test with {} MB of data to load... \n" + " -No VTCM: {} Gops \n -Basic VTCM: {} Gops \n" + " -Vectorized: {} Gops\n -Vectorized and" + " Parallelized: {} Gops\n -Preallocated and Vectorized: {} Gops\n" + " -Preallocated, Vectorized, and Parallelized: {} Gops\n" + " -Single DMA: {} Gops\n -Preloaded: {} Gops\n" +) def apply_parallel_unroll_vectorize(sch, blocks, outer_split, unroll_split, vector_split): + """Apply parallel unroll vectorized.""" for block in blocks: - vb, vi = sch.get_loops(block) - v = sch.fuse(vb, vi) - vbo, vbi, vio, vii = sch.split(v, factors=[outer_split, None, unroll_split, vector_split]) + vb_index, vi_index = sch.get_loops(block) + v = sch.fuse(vb_index, vi_index) + vbo, vbi, vio, vii = sch.split( # pylint: disable=unused-variable + v, factors=[outer_split, None, unroll_split, vector_split] + ) # pylint: disable=unused-variable sch.vectorize(vii) sch.unroll(vio) sch.parallel(vbo) @@ -41,8 +51,8 @@ def apply_parallel_unroll_vectorize(sch, blocks, outer_split, unroll_split, vect def apply_unroll_vectorize(sch, blocks, unroll_split, vector_split): for block in blocks: - vb, vi = sch.get_loops(block) - v = sch.fuse(vb, vi) + vb_index, vi_index = sch.get_loops(block) + v = sch.fuse(vb_index, vi_index) _, vio, vii = sch.split(v, factors=[None, unroll_split, vector_split]) sch.vectorize(vii) sch.unroll(vio) @@ -50,15 +60,15 @@ def apply_unroll_vectorize(sch, blocks, unroll_split, vector_split): def apply_vrmpy_parallelization(sch): - block = sch.get_block("C") + block = sch.get_block("c_buffer") b = sch.get_loops(block) - bo, _ = sch.split(b[0], factors=[4, None]) - sch.parallel(bo) + b_outer, _ = sch.split(b[0], factors=[4, None]) + sch.parallel(b_outer) return sch def apply_vtcm_cache_read_write(sch): - block = sch.get_block("C") + block = sch.get_block("c_buffer") sch.cache_read(block, 0, "global.vtcm") sch.cache_read(block, 1, "global.vtcm") sch.cache_write(block, 0, "global.vtcm") @@ -66,20 +76,22 @@ def apply_vtcm_cache_read_write(sch): def vrmpy(operations): + """Generate VRMPY operator""" + @T.prim_func def operator(a: T.handle, b: T.handle, c: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128) - B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128) - C = T.match_buffer(c, [operations, 32], dtype="int32", align=128) + a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8", align=128) + b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8", align=128) + c_buffer = T.match_buffer(c, [operations, 32], dtype="int32", align=128) for n in T.grid(operations): - with T.block("C"): - vn 
= T.axis.remap("S", [n]) - C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin( + with T.block("c_buffer"): + vn_ind = T.axis.remap("S", [n]) + c_buffer[vn_ind, T.ramp(0, 1, 32)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), T.uint32(2), - T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), - T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(a_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(b_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), dtype="int32x32", ) @@ -87,34 +99,40 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: def preloaded_vrmpy(operations): + """Generate preloaded VRMPY operator.""" + @T.prim_func def operator(a: T.handle, b: T.handle, c: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer( + a_buffer = T.match_buffer( a, [T.cast(operations, "int32") * 128], dtype="uint8", align=128, mem_scope="global.vtcm", ) - B = T.match_buffer( + b_buffer = T.match_buffer( b, [T.cast(operations, "int32") * 128], dtype="uint8", align=128, mem_scope="global.vtcm", ) - C = T.match_buffer( + c_buffer = T.match_buffer( c, [T.cast(operations, "int32") * 32], dtype="int32", align=128, mem_scope="global.vtcm" ) for n in T.grid(operations): - with T.block("C"): - vn = T.axis.remap("S", [n]) - C[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin( + with T.block("c_buffer"): + vn_ind = T.axis.remap("S", [n]) + c_buffer[T.ramp(T.cast(vn_ind, "int32") * 32, 1, 32)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), T.uint32(2), - T.reinterpret(A[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"), - T.reinterpret(B[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"), + T.reinterpret( + a_buffer[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)], dtype="int32x32" + ), + T.reinterpret( + b_buffer[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)], dtype="int32x32" + ), dtype="int32x32", ) @@ -122,6 +140,7 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: def preallocated_vrmpy(operations): + """Generate preallocated VRMPY operator.""" size = operations * 128 out_size = operations * 32 @@ -130,49 +149,56 @@ def operator( a: T.handle, b: T.handle, c: T.handle, a_v: T.handle, b_v: T.handle, c_v: T.handle ) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, mem_scope="global") - B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, mem_scope="global") - C = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global") - A_global_vtcm = T.match_buffer( + a_buffer = T.match_buffer( + a, [operations, 128], dtype="uint8", align=128, mem_scope="global" + ) + b_buffer = T.match_buffer( + b, [operations, 128], dtype="uint8", align=128, mem_scope="global" + ) + c_buffer = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global") + a_global_vtcm = T.match_buffer( a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" ) - B_global_vtcm = T.match_buffer( + b_global_vtcm = T.match_buffer( b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" ) - C_global_vtcm = T.match_buffer( + c_global_vtcm = T.match_buffer( c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm" ) for n, i in T.grid(operations, 128): with T.block("A_global.vtcm"): - vn, vi = T.axis.remap("SS", [n, i]) - A_global_vtcm[vn * 128 + vi] = A[vn, vi] + vn_ind, 
vi_index = T.axis.remap("SS", [n, i]) + a_global_vtcm[vn_ind * 128 + vi_index] = a_buffer[vn_ind, vi_index] for n, i in T.grid(operations, 128): with T.block("B_global.vtcm"): - vn, vi = T.axis.remap("SS", [n, i]) - B_global_vtcm[vn * 128 + vi] = B[vn, vi] + vn_ind, vi_index = T.axis.remap("SS", [n, i]) + b_global_vtcm[vn_ind * 128 + vi_index] = b_buffer[vn_ind, vi_index] for n in T.grid(operations): - with T.block("C"): - vn = T.axis.remap("S", [n]) - C_global_vtcm[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin( + with T.block("c_buffer"): + vn_ind = T.axis.remap("S", [n]) + c_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 32, 1, 32)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), T.uint32(2), T.reinterpret( - A_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + a_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)], + dtype="int32x32", ), T.reinterpret( - B_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + b_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)], + dtype="int32x32", ), dtype="int32x32", ) for n, i in T.grid(operations, 32): with T.block("C_global.vtcm"): - vn, vi = T.axis.remap("SS", [n, i]) - C[vn, vi] = C_global_vtcm[vn * 32 + vi] + vn_ind, vi_index = T.axis.remap("SS", [n, i]) + c_buffer[vn_ind, vi_index] = c_global_vtcm[vn_ind * 32 + vi_index] return operator def preallocated_single_dma_vrmpy(operations): + """Generate preallocated single DMA VRMPY operator.""" size = operations * 128 out_size = operations * 32 @@ -186,36 +212,40 @@ def operator( c_v: T.handle, ) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, mem_scope="global") - B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, mem_scope="global") - C = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global") - A_global_vtcm = T.match_buffer( + a_buffer = T.match_buffer( + a, [operations, 128], dtype="uint8", align=128, mem_scope="global" + ) + b_buffer = T.match_buffer( + b, [operations, 128], dtype="uint8", align=128, mem_scope="global" + ) + c_buffer = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global") + a_global_vtcm = T.match_buffer( a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" ) - B_global_vtcm = T.match_buffer( + b_global_vtcm = T.match_buffer( b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" ) - C_global_vtcm = T.match_buffer( + c_global_vtcm = T.match_buffer( c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm" ) T.evaluate( T.tvm_call_packed( "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( - A_global_vtcm.data, + a_global_vtcm.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - A_global_vtcm.dtype, + a_global_vtcm.dtype, 0, dtype="handle", ), T.tvm_stack_make_array( - A.data, + a_buffer.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - A.dtype, + a_buffer.dtype, 0, dtype="handle", ), @@ -227,20 +257,20 @@ def operator( T.tvm_call_packed( "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( - B_global_vtcm.data, + b_global_vtcm.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - B_global_vtcm.dtype, + b_global_vtcm.dtype, 0, dtype="handle", ), T.tvm_stack_make_array( - B.data, + b_buffer.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - B.dtype, + b_buffer.dtype, 0, dtype="handle", ), @@ -249,16 +279,18 @@ def operator( ) ) for n in 
T.grid(operations): - with T.block("C"): - vn = T.axis.remap("S", [n]) - C_global_vtcm[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin( + with T.block("c_buffer"): + vn_ind = T.axis.remap("S", [n]) + c_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 32, 1, 32)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), T.uint32(2), T.reinterpret( - A_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + a_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)], + dtype="int32x32", ), T.reinterpret( - B_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + b_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)], + dtype="int32x32", ), dtype="int32x32", ) @@ -266,20 +298,20 @@ def operator( T.tvm_call_packed( "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( - C.data, + c_buffer.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - C.dtype, + c_buffer.dtype, 0, dtype="handle", ), T.tvm_stack_make_array( - C_global_vtcm.data, + c_global_vtcm.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - C_global_vtcm.dtype, + c_global_vtcm.dtype, 0, dtype="handle", ), @@ -301,6 +333,7 @@ def evaluate_result(operations, tag, time, result, expected_output): def setup_and_run(hexagon_session, sch, a, b, c, operations, mem_scope="global"): + """Setup and run operator.""" func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v69")) module = hexagon_session.load_module(func_tir) @@ -321,6 +354,7 @@ def setup_and_run(hexagon_session, sch, a, b, c, operations, mem_scope="global") def setup_and_run_preallocated(hexagon_session, sch, a, b, c, operations): + """Setup and run for preallocated.""" func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v69")) module = hexagon_session.load_module(func_tir) @@ -353,34 +387,9 @@ def setup_and_run_preallocated(hexagon_session, sch, a, b, c, operations): return gops, c_hexagon.asnumpy() -@tvm.testing.fixture -def input_a(operations): - return default_rng().integers(0, 16, (operations, 128), dtype="uint8") - - -@tvm.testing.fixture -def input_b(operations): - return default_rng().integers(0, 16, (operations, 128), dtype="uint8") - - -@tvm.testing.fixture -def input_c(operations): - return np.zeros((operations, 32), dtype="int32") - - -@tvm.testing.fixture -def expected_output(operations, input_a, input_b, input_c): - expected_output = np.zeros(input_c.shape, dtype="int32") - for n in range(operations): - for i in range(32): - for r in range(4): - expected_output[n, i] = expected_output[n, i] + np.uint32( - input_a[n, i * 4 + r] - ) * np.uint32(input_b[n, i * 4 + r]) - return expected_output - - class TestMatMulVec: + """MatMul test class.""" + # Removed most of these to speedup CI. 
operations = tvm.testing.parameter( 1024, @@ -398,6 +407,29 @@ class TestMatMulVec: c_vector_split = tvm.testing.parameter(16) c_vector_split_unallocated = tvm.testing.parameter(8) + @tvm.testing.fixture + def input_a(self, operations): + return default_rng().integers(0, 16, (operations, 128), dtype="uint8") + + @tvm.testing.fixture + def input_b(self, operations): + return default_rng().integers(0, 16, (operations, 128), dtype="uint8") + + @tvm.testing.fixture + def input_c(self, operations): + return np.zeros((operations, 32), dtype="int32") + + @tvm.testing.fixture + def expected_output(self, operations, input_a, input_b, input_c): + expected_output = np.zeros(input_c.shape, dtype="int32") + for n in range(operations): + for i in range(32): + for r_ind in range(4): # pylint: disable=unused-variable + expected_output[n, i] = expected_output[n, i] + np.uint32( + input_a[n, i * 4 + r_ind] + ) * np.uint32(input_b[n, i * 4 + r_ind]) + return expected_output + @tvm.testing.requires_hexagon def test_loading_vtcm_for_vrmpy( self, @@ -413,7 +445,7 @@ def test_loading_vtcm_for_vrmpy( c_vector_split, c_vector_split_unallocated, ): - + """Load VTCM for VRMPY operator test.""" # Run parallel vrmpy without loading to VTCM. sch = tvm.tir.Schedule(vrmpy(operations)) sch = apply_vrmpy_parallelization(sch) @@ -503,7 +535,7 @@ def test_loading_vtcm_for_vrmpy( sch = apply_parallel_unroll_vectorize( sch, [sch.get_block("C_global.vtcm")], outer_split, unroll_split, c_vector_split ) - preallocated_vectorized_parallelized_runtime, result = setup_and_run_preallocated( + prealloc_vector_parallelized, result = setup_and_run_preallocated( hexagon_session, sch, input_a, input_b, input_c, operations ) result = result.reshape((operations, 32)) @@ -539,7 +571,7 @@ def test_loading_vtcm_for_vrmpy( vectorized_runtime, vectorized_parallelized_runtime, preallocated_vectorized_runtime, - preallocated_vectorized_parallelized_runtime, + prealloc_vector_parallelized, single_dma_runtime, preloaded_runtime, ) diff --git a/tests/python/contrib/test_hexagon/test_parallel_scalar.py b/tests/python/contrib/test_hexagon/test_parallel_scalar.py index fd3eef1b195b..b96265d9df99 100644 --- a/tests/python/contrib/test_hexagon/test_parallel_scalar.py +++ b/tests/python/contrib/test_hexagon/test_parallel_scalar.py @@ -25,55 +25,66 @@ from .infrastructure import get_hexagon_target -TEST_OUTPUT_TEMPLATE = "Test {} with {} operations... \n -Single Thread: {} ms \n -Parallel: {} ms\n -Speedup: {}x\n" +TEST_OUTPUT_TEMPLATE = ( + "Test {} with {} operations... 
\n"
+    " -Single Thread: {} ms \n"
+    " -Parallel: {} ms\n -Speedup: {}x\n"
+)
 
 
 def get_add_operator(operations):
+    """Generate add operator."""
+
     @T.prim_func
     def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, [operations], dtype="float64")
-        B = T.match_buffer(b, [operations], dtype="float64")
-        C = T.match_buffer(c, [operations], dtype="float64")
+        a_buffer = T.match_buffer(a, [operations], dtype="float64")
+        b_buffer = T.match_buffer(b, [operations], dtype="float64")
+        c_buffer = T.match_buffer(c, [operations], dtype="float64")
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C[vn] = A[vn] + B[vn]
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_buffer[vn_ind] = a_buffer[vn_ind] + b_buffer[vn_ind]
 
     return operator
 
 
 def get_multiply_operator(operations):
+    """Generate multiply operator."""
+
     @T.prim_func
     def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, [operations], dtype="float64")
-        B = T.match_buffer(b, [operations], dtype="float64")
-        C = T.match_buffer(c, [operations], dtype="float64")
+        a_buffer = T.match_buffer(a, [operations], dtype="float64")
+        b_buffer = T.match_buffer(b, [operations], dtype="float64")
+        c_buffer = T.match_buffer(c, [operations], dtype="float64")
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C[vn] = A[vn] * B[vn]
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_buffer[vn_ind] = a_buffer[vn_ind] * b_buffer[vn_ind]
 
     return operator
 
 
 def get_sub_operator(operations):
+    """Generate subtract operator."""
+
     @T.prim_func
     def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, [operations], dtype="float64")
-        B = T.match_buffer(b, [operations], dtype="float64")
-        C = T.match_buffer(c, [operations], dtype="float64")
+        a_buffer = T.match_buffer(a, [operations], dtype="float64")
+        b_buffer = T.match_buffer(b, [operations], dtype="float64")
+        c_buffer = T.match_buffer(c, [operations], dtype="float64")
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C[vn] = A[vn] - B[vn]
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_buffer[vn_ind] = a_buffer[vn_ind] - b_buffer[vn_ind]
 
     return operator
 
 
 def evaluate(hexagon_session, operations, expected, sch):
+    """Evaluate schedule."""
     shape = operations
     dtype = "float64"
 
@@ -104,6 +115,7 @@ def evaluate(hexagon_session, operations, expected, sch):
 
 
 class TestMatMulVec:
+    """MatMul test class."""
 
     (operation_name, operator_producer, expected_output_producer,) = tvm.testing.parameters(
         ("add", get_add_operator, (lambda a, b: a + b)),
@@ -116,9 +128,11 @@ class TestMatMulVec:
         128,
         #    256,
         #    512,
-        #    1024,  # Single thread runs faster since L2 cache can handle the entire request quickly
+        # Single thread runs faster since L2 cache can handle the entire request quickly
+        #    1024,
         #    2048,
-        #    4096,  # Significant performance degredation once the inputs and outputs cannot all fit in L2
+        # Significant performance degradation once the inputs and outputs cannot all fit in L2
+        #    4096,
         #    8192,
         #    16384,
     )
@@ -135,15 +149,16 @@ def test_add(
         operations,
         split_factor,
     ):
+        """Test Add operator."""
        sch = tvm.tir.Schedule(operator_producer(operations))
 
        single_thread_runtime = evaluate(hexagon_session, operations, expected_output_producer, 
sch) sch = tvm.tir.Schedule(operator_producer(operations)) - block = sch.get_block("C") + block = sch.get_block("c_buffer") b = sch.get_loops(block) - bo, _ = sch.split(b[0], factors=[split_factor, None]) - sch.parallel(bo) + b_output, _ = sch.split(b[0], factors=[split_factor, None]) + sch.parallel(b_output) parallel_runtime = evaluate(hexagon_session, operations, expected_output_producer, sch) speedup = round(single_thread_runtime / parallel_runtime, 2) diff --git a/tests/python/contrib/test_hexagon/test_sigmoid.py b/tests/python/contrib/test_hexagon/test_sigmoid.py index 1ff5bf3db340..e115b188a3f0 100644 --- a/tests/python/contrib/test_hexagon/test_sigmoid.py +++ b/tests/python/contrib/test_hexagon/test_sigmoid.py @@ -14,26 +14,25 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Sigmoid operator tests.""" import numpy as np -import pytest import tvm import tvm.testing from tvm import te from tvm import tir from tvm import topi -from tvm.contrib.hexagon.build import HexagonLauncher from .infrastructure import allocate_hexagon_array, get_hexagon_target -def sigmoid_compute(Input): - return topi.sigmoid(Input) +def sigmoid_compute(sigmoid_input): + return topi.sigmoid(sigmoid_input) -def sigmoid_stir_schedule(Input, Output): - sigmoid_func = te.create_prim_func([Input, Output]) +def sigmoid_stir_schedule(sigmoid_input, sigmoid_output): + sigmoid_func = te.create_prim_func([sigmoid_input, sigmoid_output]) sch = tir.Schedule(sigmoid_func, debug_mask="all") block = sch.get_block("compute") @@ -42,17 +41,6 @@ def sigmoid_stir_schedule(Input, Output): return sch -@tvm.testing.fixture -def input_np(in_shape, dtype, min_val, max_val): - return np.random.uniform(low=min_val, high=max_val, size=in_shape).astype(dtype) - - -@tvm.testing.fixture -def ref_output_np(input_np): - output_np = 1 / (1 + np.exp(-input_np)) - return output_np - - class BaseSigmoid: (in_shape, dtype, min_val, max_val,) = tvm.testing.parameters( ((64,), "float16", -8.0, 8.0), @@ -64,6 +52,17 @@ class BaseSigmoid: class TestSigmoid(BaseSigmoid): + """Sigmoid test class.""" + + @tvm.testing.fixture + def input_np(self, in_shape, dtype, min_val, max_val): + return np.random.uniform(low=min_val, high=max_val, size=in_shape).astype(dtype) + + @tvm.testing.fixture + def ref_output_np(self, input_np): + output_np = 1 / (1 + np.exp(-input_np)) + return output_np + @tvm.testing.requires_hexagon def test_sigmoid( self, @@ -73,11 +72,12 @@ def test_sigmoid( ref_output_np, hexagon_session, ): - InputTensor = te.placeholder(in_shape, name="InputTensor", dtype=dtype) + """Sigmoid test.""" + input_tensor = te.placeholder(in_shape, name="input_tensor", dtype=dtype) - OutputTensor = sigmoid_compute(InputTensor) + output_tensor = sigmoid_compute(input_tensor) - tir_s = sigmoid_stir_schedule(InputTensor, OutputTensor) + tir_s = sigmoid_stir_schedule(input_tensor, output_tensor) input_data = allocate_hexagon_array( hexagon_session.device, diff --git a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py index f80a579f58fe..ba7513a4f39c 100644 --- a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py +++ b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py @@ -14,9 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+"""Async software pipeline tests."""
 
-import sys
-import pytest
 import numpy as np
 
 import tvm
@@ -25,174 +24,178 @@
 
 from .infrastructure import get_hexagon_target
 
-outer = tvm.testing.parameter(8, 16)
-inner = tvm.testing.parameter(64, 128)
-dtype = tvm.testing.parameter("uint8", "float16")
-scope = tvm.testing.parameter("global", "global.vtcm")
-# TODO(Joseph) Turn on "multi_input_diffQ" compute type once we have upstreamed
-# changes in the InjectSoftwarePipeline pass to alleviate this restriction:
-# 'A dependency on multiple async stages is not supported'
-comp_type = tvm.testing.parameter("single_input", "multi_input_sameQ")
-# TODO(Straw) Add back "cache_write" schedule type once we have upstreamed
-# buffer dependency analysis in InjectSoftwarePipeline pass
-# to insert approprite TIR "wait" attributes for this schedule
-sched_type = tvm.testing.parameter("cache_read", "cache_read_write")
-
-
-@tvm.testing.fixture
-def data(comp_type, outer, inner, dtype):
-    out_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
-    a_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
-    if comp_type == "single_input":
-        return out_np, a_np
-    else:
-        b_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
-        return out_np, a_np, b_np
-
-@tvm.testing.fixture
 def compute(comp_type, outer, inner, dtype):
+    """Generate compute function."""
     if comp_type == "single_input":
 
         @T.prim_func
         def a_plus_1_primfunc(
-            A: T.Buffer[(outer, inner), dtype], OUT: T.Buffer[(outer, inner), dtype]
+            a_buffer: T.Buffer[(outer, inner), dtype], out: T.Buffer[(outer, inner), dtype]
         ):
             for i in T.serial(outer):
                 for j in T.serial(inner):
                     with T.block("compute"):
                         with T.block():
-                            OUT[i, j] = A[i, j] + T.cast(1, dtype)
+                            out[i, j] = a_buffer[i, j] + T.cast(1, dtype)
 
         return a_plus_1_primfunc
     else:
 
         @T.prim_func
         def a_plus_b_plus_1_primfunc(
-            A: T.Buffer[(outer, inner), dtype],
-            B: T.Buffer[(outer, inner), dtype],
-            OUT: T.Buffer[(outer, inner), dtype],
+            a_buffer: T.Buffer[(outer, inner), dtype],
+            b_buffer: T.Buffer[(outer, inner), dtype],
+            out: T.Buffer[(outer, inner), dtype],
         ):
             for i in T.serial(outer):
                 for j in T.serial(inner):
                     with T.block("compute"):
                         with T.block():
-                            OUT[i, j] = A[i, j] + B[i, j] + T.cast(1, dtype)
+                            out[i, j] = a_buffer[i, j] + b_buffer[i, j] + T.cast(1, dtype)
 
         return a_plus_b_plus_1_primfunc
 
 
-@tvm.testing.fixture
-def reference(comp_type):
-    if comp_type == "single_input":
-
-        def a_plus_1_ref(a):
-            return a + 1
-
-        return a_plus_1_ref
-    else:
-
-        def a_plus_b_plus_1_ref(a, b):
-            return a + b + 1
-
-        return a_plus_b_plus_1_ref
-
-
-@tvm.testing.fixture
-def schedule(comp_type, compute, sched_type, scope):
-    sch = tir.Schedule(compute)
-
-    compute_block = sch.get_block("compute")
-    i, _ = sch.get_loops(compute_block)
-
-    if "read" in sched_type:
-        cache_read_a = sch.cache_read(compute_block, 0, scope)
-        sch.compute_at(cache_read_a, i)
+class TestAsyncSoftwarePipeline:
+    """Async software pipeline test class."""
+
+    outer = tvm.testing.parameter(8, 16)
+    inner = tvm.testing.parameter(64, 128)
+    dtype = tvm.testing.parameter("uint8", "float16")
+    scope = tvm.testing.parameter("global", "global.vtcm")
+    # TODO(Joseph) Turn on "multi_input_diffQ" compute type once we have upstreamed
+    # changes in the InjectSoftwarePipeline pass to alleviate this restriction:
+    # 'A dependency on multiple async stages is not supported'
+    comp_type = tvm.testing.parameter("single_input", "multi_input_sameQ")
+    # TODO(Straw) Add back "cache_write" 
schedule type once we have upstreamed + # buffer dependency analysis in InjectSoftwarePipeline pass + # to insert approprite TIR "wait" attributes for this schedule + sched_type = tvm.testing.parameter("cache_read", "cache_read_write") + + @tvm.testing.fixture + def data(self, comp_type, outer, inner, dtype): + out_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype) + a_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype) + if comp_type == "single_input": + return out_np, a_np + else: + b_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype) + return out_np, a_np, b_np + + @tvm.testing.fixture + def verify(self, dtype): + def check(out, ref): + if "int" in dtype: + np.testing.assert_equal(out.numpy(), ref) + else: + np.testing.assert_allclose(out.numpy(), ref, rtol=1e-3, atol=1e-3) + + return check + + @tvm.testing.fixture + def reference(self, comp_type): + """Returns reference data.""" + if comp_type == "single_input": - if "multi_input" in comp_type: - cache_read_b = sch.cache_read(compute_block, 1, scope) - sch.compute_at(cache_read_b, i) + def a_plus_1_ref(a): + return a + 1 - if "write" in sched_type: - cache_write_out = sch.cache_write(compute_block, 0, scope) - sch.reverse_compute_at(cache_write_out, i) + return a_plus_1_ref + else: - if "read" in sched_type and "write" in sched_type: - if comp_type == "single_input": - sch.annotate(i, "software_pipeline_stage", [0, 1, 2]) - sch.annotate(i, "software_pipeline_order", [0, 1, 2]) - sch.annotate(i, "software_pipeline_async_stages", [0, 2]) - elif comp_type == "multi_input_sameQ": - sch.annotate(i, "software_pipeline_stage", [0, 0, 1, 2]) - sch.annotate(i, "software_pipeline_order", [0, 1, 2, 3]) - sch.annotate(i, "software_pipeline_async_stages", [0, 2]) - elif comp_type == "multi_input_diffQ": - sch.annotate(i, "software_pipeline_stage", [0, 1, 2, 3]) - sch.annotate(i, "software_pipeline_order", [0, 1, 2, 3]) - sch.annotate(i, "software_pipeline_async_stages", [0, 1, 2]) - - elif "read" in sched_type: - if comp_type == "single_input": + def a_plus_b_plus_1_ref(a, b): + return a + b + 1 + + return a_plus_b_plus_1_ref + + @tvm.testing.fixture + def schedule(self, comp_type, sched_type, outer, inner, dtype, scope): + """Generate schedule.""" + sch = tir.Schedule(compute(comp_type, outer, inner, dtype)) + + compute_block = sch.get_block("compute") + i, _ = sch.get_loops(compute_block) + + if "read" in sched_type: + cache_read_a = sch.cache_read(compute_block, 0, scope) + sch.compute_at(cache_read_a, i) + + if "multi_input" in comp_type: + cache_read_b = sch.cache_read(compute_block, 1, scope) + sch.compute_at(cache_read_b, i) + + if "write" in sched_type: + cache_write_out = sch.cache_write(compute_block, 0, scope) + sch.reverse_compute_at(cache_write_out, i) + + if "read" in sched_type and "write" in sched_type: + if comp_type == "single_input": + sch.annotate(i, "software_pipeline_stage", [0, 1, 2]) + sch.annotate(i, "software_pipeline_order", [0, 1, 2]) + sch.annotate(i, "software_pipeline_async_stages", [0, 2]) + elif comp_type == "multi_input_sameQ": + sch.annotate(i, "software_pipeline_stage", [0, 0, 1, 2]) + sch.annotate(i, "software_pipeline_order", [0, 1, 2, 3]) + sch.annotate(i, "software_pipeline_async_stages", [0, 2]) + elif comp_type == "multi_input_diffQ": + sch.annotate(i, "software_pipeline_stage", [0, 1, 2, 3]) + sch.annotate(i, "software_pipeline_order", [0, 1, 2, 3]) + sch.annotate(i, "software_pipeline_async_stages", [0, 1, 2]) + + elif "read" in 
sched_type: + if comp_type == "single_input": + sch.annotate(i, "software_pipeline_stage", [0, 1]) + sch.annotate(i, "software_pipeline_order", [0, 1]) + sch.annotate(i, "software_pipeline_async_stages", [0]) + elif comp_type == "multi_input_sameQ": + sch.annotate(i, "software_pipeline_stage", [0, 0, 1]) + sch.annotate(i, "software_pipeline_order", [0, 1, 2]) + sch.annotate(i, "software_pipeline_async_stages", [0]) + elif comp_type == "multi_input_diffQ": + sch.annotate(i, "software_pipeline_stage", [0, 1, 2]) + sch.annotate(i, "software_pipeline_order", [0, 1, 2]) + sch.annotate(i, "software_pipeline_async_stages", [0, 1]) + + elif "write" in sched_type: sch.annotate(i, "software_pipeline_stage", [0, 1]) sch.annotate(i, "software_pipeline_order", [0, 1]) - sch.annotate(i, "software_pipeline_async_stages", [0]) - elif comp_type == "multi_input_sameQ": - sch.annotate(i, "software_pipeline_stage", [0, 0, 1]) - sch.annotate(i, "software_pipeline_order", [0, 1, 2]) - sch.annotate(i, "software_pipeline_async_stages", [0]) - elif comp_type == "multi_input_diffQ": - sch.annotate(i, "software_pipeline_stage", [0, 1, 2]) - sch.annotate(i, "software_pipeline_order", [0, 1, 2]) - sch.annotate(i, "software_pipeline_async_stages", [0, 1]) - - elif "write" in sched_type: - sch.annotate(i, "software_pipeline_stage", [0, 1]) - sch.annotate(i, "software_pipeline_order", [0, 1]) - sch.annotate(i, "software_pipeline_async_stages", [1]) - - return sch - - -@tvm.testing.fixture -def verify(dtype): - def check(out, ref): - if "int" in dtype: - np.testing.assert_equal(out.numpy(), ref) - else: - np.testing.assert_allclose(out.numpy(), ref, rtol=1e-3, atol=1e-3) - - return check + sch.annotate(i, "software_pipeline_async_stages", [1]) + return sch -@tvm.testing.requires_hexagon -def test_async_software_pipeline(hexagon_launcher, comp_type, data, reference, schedule, verify): - out_np = data[0] - a_np = data[1] - if comp_type == "single_input": - ref = reference(a_np) - else: - b_np = data[2] - ref = reference(a_np, b_np) - - with tvm.transform.PassContext( - config={"tir.use_async_copy": 1, "tir.merge_async_commit_queue_scope": False} + @tvm.testing.requires_hexagon + def test_async_software_pipeline( + self, hexagon_launcher, comp_type, data, reference, schedule, verify ): - # tvm.lower(schedule.mod["main"]).show() - func = tvm.build(schedule.mod["main"], target=get_hexagon_target("v68")) - - with hexagon_launcher.create_session() as hexagon_session: - dev = hexagon_session.device - mod = hexagon_session.load_module(func) - out = tvm.nd.array(out_np, device=dev) - a = tvm.nd.array(a_np, device=dev) + """Async software pipeline test.""" + out_np = data[0] + a_np = data[1] if comp_type == "single_input": - mod(a, out) + ref = reference(a_np) else: - b = tvm.nd.array(b_np, device=dev) - mod(a, b, out) + b_np = data[2] + ref = reference(a_np, b_np) + + with tvm.transform.PassContext( + config={"tir.use_async_copy": 1, "tir.merge_async_commit_queue_scope": False} + ): + # tvm.lower(schedule.mod["main"]).show() + func = tvm.build(schedule.mod["main"], target=get_hexagon_target("v68")) + + with hexagon_launcher.create_session() as hexagon_session: + dev = hexagon_session.device + mod = hexagon_session.load_module(func) + out = tvm.nd.array(out_np, device=dev) + a = tvm.nd.array(a_np, device=dev) + if comp_type == "single_input": + mod(a, out) + else: + b = tvm.nd.array(b_np, device=dev) + mod(a, b, out) - verify(out, ref) + verify(out, ref) if __name__ == "__main__": - sys.exit(pytest.main(sys.argv)) + 
tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py index 307d3a96bf15..980ac0cf4c2a 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -27,47 +27,56 @@ MB = 1024**2 KB = 1024 -TEST_OUTPUT_TEMPLATE = "Test bandwidth with buffer size {}MB... \n -Base: {} GBps \n -Vectorized: {} GBps\n -Vectorized and Parallelized: {} GBps\n -Single DMA Copy: {} GBps\n" +TEST_OUTPUT_TEMPLATE = ( + "Test bandwidth with buffer size {}MB... \n" + " -Base: {} GBps \n -Vectorized: {} GBps\n" + " -Vectorized and Parallelized: {} GBps\n" + " -Single DMA Copy: {} GBps\n" +) def memcopy_operator(size): + """Generate memory copy operator.""" + @T.prim_func def operator(a: T.handle, a_v: T.handle) -> None: - A = T.match_buffer(a, size, dtype="int8", align=128, scope="global") - A_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm") + a_buffer = T.match_buffer(a, size, dtype="int8", align=128, scope="global") + a_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm") for ax0 in T.serial(size): with T.block("A_global.vtcm"): - v0 = T.axis.spatial(size, ax0) - T.reads(A[v0]) - T.writes(A_global_vtcm[v0]) - A_global_vtcm[v0] = A[v0] + v0_ind = T.axis.spatial(size, ax0) + T.reads(a_buffer[v0_ind]) + T.writes(a_global_vtcm[v0_ind]) + a_global_vtcm[v0_ind] = a_buffer[v0_ind] return operator def single_dma_operator(size): + """Generate single dma operator.""" + @T.prim_func def operator(a: T.handle, a_v: T.handle) -> None: - A = T.match_buffer(a, size, dtype="int8", align=128, scope="global") - A_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm") + a_buffer = T.match_buffer(a, size, dtype="int8", align=128, scope="global") + a_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm") T.evaluate( T.tvm_call_packed( "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( - A_global_vtcm.data, + a_global_vtcm.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - A_global_vtcm.dtype, + a_global_vtcm.dtype, 0, dtype="handle", ), T.tvm_stack_make_array( - A.data, + a_buffer.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - A.dtype, + a_buffer.dtype, 0, dtype="handle", ), @@ -80,6 +89,7 @@ def operator(a: T.handle, a_v: T.handle) -> None: def evaluate(hexagon_session, sch, size): + """Evaluate schedule.""" a_shape = size func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v69")) @@ -110,6 +120,7 @@ def evaluate(hexagon_session, sch, size): class TestMatMulVec: + """MatMul test class.""" # Removed most of these to speedup CI. size = tvm.testing.parameter( @@ -133,7 +144,7 @@ class TestMatMulVec: @tvm.testing.requires_hexagon def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vector_split): - + """Test bandwidth.""" # Run the base memcopy operator. sch = tvm.tir.Schedule(memcopy_operator(size)) base_gpbs = evaluate(hexagon_session, sch, size) @@ -141,8 +152,8 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto # Run with some basic unroll and vectorize scheduling. 
sch = tvm.tir.Schedule(memcopy_operator(size)) vtcm_block_a = sch.get_block("A_global.vtcm") - vb = sch.get_loops(vtcm_block_a) - vbi_a, vio_a, vii_a = sch.split(vb[0], factors=[None, unroll_split, vector_split]) + v_block = sch.get_loops(vtcm_block_a) + _, vio_a, vii_a = sch.split(v_block[0], factors=[None, unroll_split, vector_split]) sch.unroll(vio_a) sch.vectorize(vii_a) vectorize_gbps = evaluate(hexagon_session, sch, size) @@ -150,9 +161,9 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto # Run with some basic unroll and vectorize scheduling and parallelization. sch = tvm.tir.Schedule(memcopy_operator(size)) vtcm_block_a = sch.get_block("A_global.vtcm") - vb = sch.get_loops(vtcm_block_a) - vbo_a, vbi_a, vio_a, vii_a = sch.split( - vb[0], factors=[outer_split, None, unroll_split, vector_split] + v_block = sch.get_loops(vtcm_block_a) + vbo_a, _, vio_a, vii_a = sch.split( + v_block[0], factors=[outer_split, None, unroll_split, vector_split] ) sch.unroll(vio_a) sch.vectorize(vii_a) @@ -169,3 +180,7 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto mbs, base_gpbs, vectorize_gbps, parallel_gbps, single_dma_gbps ) ) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py index 2fc607c0c521..e4edf2919a00 100644 --- a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py +++ b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py @@ -14,8 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""No QNN canonicalization tests.""" -import pytest import numpy as np import tvm.testing @@ -27,6 +27,7 @@ @tvm.testing.requires_hexagon def test_no_qnn_pass(): + """No QNN pass test.""" x = relay.var("x", shape=(4, 8), dtype="float32") op0 = relay.qnn.op.quantize(x, relay.const(2.0), relay.const(10), out_dtype="uint8") op1 = relay.qnn.op.dequantize(op0, relay.const(0.5), relay.const(5)) @@ -61,6 +62,7 @@ def execute(executor, data_np, weight_np, bias_np=None): @tvm.testing.requires_hexagon def test_qnn_conv2d_rq(hexagon_session: Session): + """QNN conv2d test.""" data_shape = [1, 8, 32, 32] weight_shape = [16, 8, 3, 3] data = relay.var("data", shape=data_shape, dtype="float32") @@ -119,6 +121,7 @@ def test_qnn_conv2d_rq(hexagon_session: Session): @tvm.testing.requires_hexagon def test_qnn_dense_bias_rq(hexagon_session: Session): + """QNN dense with bias test.""" data_shape = [8, 8] weight_shape = [16, 8] bias_shape = [16] From fe11aec3990d4f314f5a56c1ee7cd7e02dce0761 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Fri, 28 Oct 2022 22:46:01 +0000 Subject: [PATCH 2/3] fix error --- .../test_hexagon/test_async_dma_pipeline.py | 4 ++++ .../contrib/test_hexagon/test_cache_read_write.py | 4 ++++ .../contrib/test_hexagon/test_meta_schedule.py | 15 +++++++++------ .../test_hexagon/test_parallel_hvx_load_vtcm.py | 4 ++++ .../contrib/test_hexagon/test_run_unit_tests.py | 4 ++++ 5 files changed, 25 insertions(+), 6 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index c5830387160b..a7a05c2aa3a7 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -823,3 +823,7 @@ def test_meta(hexagon_session): 
"pipeline_runtime": pipeline_runtime, }, ) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py index af5e7a398870..3ac297fd80d8 100644 --- a/tests/python/contrib/test_hexagon/test_cache_read_write.py +++ b/tests/python/contrib/test_hexagon/test_cache_read_write.py @@ -220,3 +220,7 @@ def _visit(stmt): "AllocateNode found in lowered IRModule, " "VTCM allocations should have been lowered to tir.nd_mem_alloc_with_scope" ) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py index 9f4fd0768c50..a83a3b279a7f 100644 --- a/tests/python/contrib/test_hexagon/test_meta_schedule.py +++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py @@ -46,11 +46,10 @@ class MatmulModule: """Matmultest class""" + # pylint: disable=no-self-argument @T.prim_func - def main( # type: ignore # pylint: disable=no-self-argument - a: T.handle, b: T.handle, c: T.handle - ) -> None: - """main function""" + def main(a: T.handle, b: T.handle, c: T.handle) -> None: # type: ignore + # pylint: disable=missing-function-docstring T.func_attr({"global_symbol": "main", "tir.noalias": True}) a_buffer = T.match_buffer(a, (16, 16), "float32") b_buffer = T.match_buffer(b, (16, 16), "float32") @@ -243,7 +242,7 @@ def main( # type: ignore packed_width: T.Buffer[(24, 192, 32, 4), "uint8"], # type: ignore compute: T.Buffer[(128, 768), "int32"], # type: ignore ) -> None: - """Main function.""" + # pylint: disable=missing-function-docstring T.func_attr({"global_symbol": "main", "tir.noalias": True}) for i0_0_i1_0_0_fused in T.parallel( 512, annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1} @@ -292,7 +291,7 @@ def main( # type: ignore b_buffer[0, 0:128], dtype="int32x32" ) # type: ignore c_buffer[0:32] = T.call_llvm_pure_intrin( # type: ignore - 4390, T.uint32(3), C[0:32], b_i32x32, a_i32, dtype="int32x32" + 4390, T.uint32(3), c_buffer[0:32], b_i32x32, a_i32, dtype="int32x32" ) @@ -495,3 +494,7 @@ def test_dense_relay_auto_schedule(hexagon_launcher): # Fairly loose check since fp16 results between x86 and Hexagon have # non-trivial difference. 
     assert np.mean(np.abs(ref - out)) < 0.1
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
index f720f67ea354..ebcea50e0bae 100644
--- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
@@ -576,3 +576,7 @@ def test_loading_vtcm_for_vrmpy(
             preloaded_runtime,
         )
     )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_run_unit_tests.py b/tests/python/contrib/test_hexagon/test_run_unit_tests.py
index 24c9f33a8ecb..cd4e5c9b0d66 100644
--- a/tests/python/contrib/test_hexagon/test_run_unit_tests.py
+++ b/tests/python/contrib/test_hexagon/test_run_unit_tests.py
@@ -48,3 +48,7 @@ def test_run_unit_tests(hexagon_session: Session, gtest_args):
         raise RuntimeError(
             f"Hexagon gtest retruned non-zero error code = {gtest_error_code}:\n{gtest_output}"
         )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From f24020e6be2fe7989d58e656e8c95b399f989b13 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar
Date: Wed, 2 Nov 2022 07:17:25 -0700
Subject: [PATCH 3/3] Fix buffer name

---
 .../test_parallel_hvx_load_vtcm.py | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
index ebcea50e0bae..fb398f43977a 100644
--- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
@@ -166,11 +166,11 @@ def operator(
         c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm"
     )
     for n, i in T.grid(operations, 128):
-        with T.block("A_global.vtcm"):
+        with T.block("a_buffer_global.vtcm"):
             vn_ind, vi_index = T.axis.remap("SS", [n, i])
             a_global_vtcm[vn_ind * 128 + vi_index] = a_buffer[vn_ind, vi_index]
     for n, i in T.grid(operations, 128):
-        with T.block("B_global.vtcm"):
+        with T.block("b_buffer_global.vtcm"):
             vn_ind, vi_index = T.axis.remap("SS", [n, i])
             b_global_vtcm[vn_ind * 128 + vi_index] = b_buffer[vn_ind, vi_index]
     for n in T.grid(operations):
@@ -190,7 +190,7 @@ def operator(
                 dtype="int32x32",
             )
     for n, i in T.grid(operations, 32):
-        with T.block("C_global.vtcm"):
+        with T.block("c_buffer_global.vtcm"):
             vn_ind, vi_index = T.axis.remap("SS", [n, i])
             c_buffer[vn_ind, vi_index] = c_global_vtcm[vn_ind * 32 + vi_index]
@@ -328,7 +328,7 @@ def evaluate_result(operations, tag, time, result, expected_output):
     gops = round(operations * 128 * 3 / time.mean / 1e9, 3)
     mean_ms = round(time.mean * 1000, 6)
-    print("\ntest_{}MB_{} took {} ms @ GOPS: {}".format(transfer_mb, tag, mean_ms, gops))
+    print(f"\ntest_{transfer_mb}MB_{tag} took {mean_ms} ms @ GOPS: {gops}")

     tvm.testing.assert_allclose(result, expected_output)
@@ -469,12 +469,12 @@ def test_loading_vtcm_for_vrmpy(
     sch = apply_vrmpy_parallelization(sch)
     sch = apply_unroll_vectorize(
         sch,
-        [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+        [sch.get_block("a_buffer_global.vtcm"), sch.get_block("b_buffer_global.vtcm")],
         unroll_split,
         vector_split,
     )
     sch = apply_unroll_vectorize(
-        sch, [sch.get_block("C_global.vtcm")], unroll_split, c_vector_split_unallocated
+        sch, [sch.get_block("c_buffer_global.vtcm")], unroll_split, c_vector_split_unallocated
     )
     vectorized_runtime, result = setup_and_run(
         hexagon_session, sch, input_a, input_b, input_c, operations
@@ -487,14 +487,14 @@ def test_loading_vtcm_for_vrmpy(
     sch = apply_vrmpy_parallelization(sch)
     sch = apply_parallel_unroll_vectorize(
         sch,
-        [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+        [sch.get_block("a_buffer_global.vtcm"), sch.get_block("b_buffer_global.vtcm")],
         outer_split,
         unroll_split,
         vector_split,
     )
     sch = apply_parallel_unroll_vectorize(
         sch,
-        [sch.get_block("C_global.vtcm")],
+        [sch.get_block("c_buffer_global.vtcm")],
         outer_split,
         unroll_split,
         c_vector_split_unallocated,
@@ -509,12 +509,12 @@ def test_loading_vtcm_for_vrmpy(
     sch = apply_vrmpy_parallelization(sch)
     sch = apply_unroll_vectorize(
         sch,
-        [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+        [sch.get_block("a_buffer_global.vtcm"), sch.get_block("b_buffer_global.vtcm")],
         unroll_split,
         vector_split,
     )
     sch = apply_unroll_vectorize(
-        sch, [sch.get_block("C_global.vtcm")], unroll_split, c_vector_split
+        sch, [sch.get_block("c_buffer_global.vtcm")], unroll_split, c_vector_split
     )
     preallocated_vectorized_runtime, result = setup_and_run_preallocated(
         hexagon_session, sch, input_a, input_b, input_c, operations
@@ -527,13 +527,13 @@ def test_loading_vtcm_for_vrmpy(
     sch = apply_vrmpy_parallelization(sch)
     sch = apply_parallel_unroll_vectorize(
         sch,
-        [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+        [sch.get_block("a_buffer_global.vtcm"), sch.get_block("b_buffer_global.vtcm")],
         outer_split,
         unroll_split,
         vector_split,
     )
     sch = apply_parallel_unroll_vectorize(
-        sch, [sch.get_block("C_global.vtcm")], outer_split, unroll_split, c_vector_split
+        sch, [sch.get_block("c_buffer_global.vtcm")], outer_split, unroll_split, c_vector_split
     )
     prealloc_vector_parallelized, result = setup_and_run_preallocated(
         hexagon_session, sch, input_a, input_b, input_c, operations