diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
index 1f33e975ad69..2b2cb64b13b1 100644
--- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
+++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
@@ -107,6 +107,7 @@ def evaluate(
     tvm.testing.assert_allclose(c_hexagon.asnumpy(), expected_output)
     return round(time.mean * 1000, 4)
 
+
 @tvm.testing.fixture
 def input_a(size_a):
     return default_rng().integers(0, 8, (size_a, VRMPY_SIZE_B), dtype="uint8")
@@ -304,9 +305,11 @@ class TestAsyncDMAPipeline:
     size_a = tvm.testing.parameter(
         1024,
         64 * 64,
+        128 * 64,
     )
 
     size_w = tvm.testing.parameter(
+        1 * 1,
         3 * 3,
         9 * 9,
     )
@@ -326,11 +329,24 @@ def test_loading_vtcm_for_vrmpy(
             pytest.skip("Skipping test since it takes too long in CI.")
 
         sch = conv_approximation(size_a, size_w)
-        base_runtime = evaluate(hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output)
+        base_runtime = evaluate(
+            hexagon_session,
+            sch,
+            input_a,
+            input_w,
+            np.zeros(expected_output.shape, "int32"),
+            expected_output,
+        )
 
         sch = get_fake_conv_vtcm_schedule(size_a, size_w)
         base_vtcm_runtime = evaluate(
-            hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output, use_async_copy=1
+            hexagon_session,
+            sch,
+            input_a,
+            input_w,
+            np.zeros(expected_output.shape, "int32"),
+            expected_output,
+            use_async_copy=1,
         )
 
         sch = get_fake_conv_vtcm_schedule(size_a, size_w)
@@ -339,7 +355,13 @@ def test_loading_vtcm_for_vrmpy(
         sch.annotate(n, "software_pipeline_order", [0, 1, 2])
         sch.annotate(n, "software_pipeline_async_stages", [0])
         async_input_runtime = evaluate(
-            hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output, use_async_copy=1
+            hexagon_session,
+            sch,
+            input_a,
+            input_w,
+            np.zeros(expected_output.shape, "int32"),
+            expected_output,
+            use_async_copy=1,
         )
 
         sch = get_fake_conv_vtcm_schedule(size_a, size_w)
@@ -348,7 +370,28 @@ def test_loading_vtcm_for_vrmpy(
         sch.annotate(n, "software_pipeline_order", [0, 1, 2])
         sch.annotate(n, "software_pipeline_async_stages", [0, 2])
         async_input_output_runtime = evaluate(
-            hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output, use_async_copy=1
+            hexagon_session,
+            sch,
+            input_a,
+            input_w,
+            np.zeros(expected_output.shape, "int32"),
+            expected_output,
+            use_async_copy=1,
+        )
+
+        sch = get_fake_conv_vtcm_schedule(size_a, size_w)
+        n = sch.get_loops(sch.get_block("C"))[0]
+        sch.annotate(n, "software_pipeline_stage", [0, 3, 6])
+        sch.annotate(n, "software_pipeline_order", [0, 1, 2])
+        sch.annotate(n, "software_pipeline_async_stages", [0, 6])
+        async_input_output_runtime_larger_buffers = evaluate(
+            hexagon_session,
+            sch,
+            input_a,
+            input_w,
+            np.zeros(expected_output.shape, "int32"),
+            expected_output,
+            use_async_copy=1,
         )
 
         sch = get_multi_input_fake_conv_vtcm_schedule(size_a, size_w)
@@ -373,12 +416,23 @@ def test_loading_vtcm_for_vrmpy(
         sch.annotate(n, "software_pipeline_order", [0, 1, 2])
         sch.annotate(n, "software_pipeline_async_stages", [2])
         async_output_runtime = evaluate(
-            hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output, use_async_copy=1
+            hexagon_session,
+            sch,
+            input_a,
+            input_w,
+            np.zeros(expected_output.shape, "int32"),
+            expected_output,
+            use_async_copy=1,
         )
 
         sch = get_single_dma_schedule(size_a, size_w)
         single_dma_runtime = evaluate(
-            hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output
+            hexagon_session,
+            sch,
+            input_a,
+            input_w,
+            np.zeros(expected_output.shape, "int32"),
+            expected_output,
         )
 
         # Total transfer size is equal to the size of A + W + C which is equal to 2 * size_a * 128 + size_w * 128
@@ -396,14 +450,20 @@ def test_loading_vtcm_for_vrmpy(
                 "async_dma_output": async_output_runtime,
                 "async_dma_input_output": async_input_output_runtime,
                 "async_dma_multi_input_output": async_multi_input_output_runtime,
+                "async_input_output_runtime_larger_buffers": async_input_output_runtime_larger_buffers,
             },
         )
 
 
+# from tvm.script import tir as T
 @tvm.script.ir_module
 class ModulePipelined:
     @T.prim_func
-    def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"]) -> None:
+    def main(
+        p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"],
+        p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"],
+        T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"],
+    ) -> None:
         # function attr dict
         T.func_attr({"tir.noalias": True, "global_symbol": "main"})
         # body
@@ -413,7 +473,9 @@ def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1
         p1_global_vtcm = T.alloc_buffer([2, 1, 7, 7, 1, 32, 4], dtype="int8", scope="global.vtcm")
         for ax0, ax1, ax2, ax3, ax4, ax5, ax6 in T.grid(2, 1, 7, 7, 1, 32, 4):
             with T.block("p1_global.vtcm"):
-                v0, v1, v2, v3, v4, v5, v6 = T.axis.remap("SSSSSSS", [ax0, ax1, ax2, ax3, ax4, ax5, ax6])
+                v0, v1, v2, v3, v4, v5, v6 = T.axis.remap(
+                    "SSSSSSS", [ax0, ax1, ax2, ax3, ax4, ax5, ax6]
+                )
                 T.reads(p1[v0, v1, v2, v3, v4, v5, v6])
                 T.writes(p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6])
                 p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6] = p1[v0, v1, v2, v3, v4, v5, v6]
@@ -437,7 +499,7 @@ def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1
                     ow = T.axis.spatial(112, (po * 28 + i) % 14 * 8 + iiii)
                     oc_block_o = T.axis.spatial(1, 0)
                     T.reads()
-                    T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32])
+                    T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32])
                     for i4_1 in T.vectorized(32):
                         with T.block("conv2d_NCHWc_int8_init"):
                             oc_block_i_init = T.axis.spatial(32, i4_1)
@@ -456,16 +518,57 @@ def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1
                     ic_outer = T.axis.reduce(1, 0)
                     ic_f_inner = T.axis.reduce(1, 0)
                     ic_s_inner_o = T.axis.reduce(1, 0)
-                    T.reads(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32], p0_global_vtcm[n, ic_outer, oh * 2 + kh, ow * 2 + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 32, 0 : 4])
-                    T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32])
-                    A = T.match_buffer(p0_global_vtcm[n, ic_outer, oh * 2 + kh, ow * 2 + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], [4], dtype="uint8", offset_factor=1, scope="global.vtcm")
-                    B = T.match_buffer(p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 32, 0 : 4], [32, 4], dtype="int8", offset_factor=1, scope="global.vtcm")
-                    C = T.match_buffer(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32], [32], dtype="int32", offset_factor=1, scope="global.vtcm")
+                    T.reads(
+                        conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32],
+                        p0_global_vtcm[
+                            n,
+                            ic_outer,
+                            oh * 2 + kh,
+                            ow * 2 + kw,
+                            ic_f_inner * 4 : ic_f_inner * 4 + 4,
+                        ],
+                        p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                    )
+                    T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32])
+                    A = T.match_buffer(
+                        p0_global_vtcm[
+                            n,
+                            ic_outer,
+                            oh * 2 + kh,
+                            ow * 2 + kw,
+                            ic_f_inner * 4 : ic_f_inner * 4 + 4,
+                        ],
+                        [4],
+                        dtype="uint8",
+                        offset_factor=1,
+                        scope="global.vtcm",
+                    )
+                    B = T.match_buffer(
+                        p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                        [32, 4],
+                        dtype="int8",
+                        offset_factor=1,
+                        scope="global.vtcm",
+                    )
+                    C = T.match_buffer(
+                        conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32],
+                        [32],
+                        dtype="int32",
+                        offset_factor=1,
+                        scope="global.vtcm",
+                    )
                     A_u8x4: T.uint8x4 = A[0:4]
                     A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
                     B_i8x128 = B[0, 0:128]
                     B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
-                    C[0:32] = T.call_llvm_pure_intrin(4217, T.uint32(3), C[0:32], T.broadcast(A_i32, 32), B_i32x32, dtype="int32x32")
+                    C[0:32] = T.call_llvm_pure_intrin(
+                        4217,
+                        T.uint32(3),
+                        C[0:32],
+                        T.broadcast(A_i32, 32),
+                        B_i32x32,
+                        dtype="int32x32",
+                    )
             for i in T.serial(200704):
                 with T.block("conv2d_NCHWc_int8.vtcm"):
                     ax0_1 = T.axis.spatial(1, 0)
@@ -475,30 +578,45 @@ def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1
                    ax4 = T.axis.spatial(32, i % 32)
                     T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
                     T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
-                    T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4]
+                    T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[
+                        ax0_1, ax1_1, ax2_1, ax3_1, ax4
+                    ]
 
 
+# from tvm.script import tir as T
 @tvm.script.ir_module
 class ModuleBase:
     @T.prim_func
-    def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"]) -> None:
+    def main(
+        p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"],
+        p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"],
+        T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"],
+    ) -> None:
         # function attr dict
         T.func_attr({"tir.noalias": True, "global_symbol": "main"})
         # buffer definition
         # body
         # with T.block("root")
         conv2d_NCHWc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32")
-        for i0_0_i1_0_i2_0_i3_0_fused in T.parallel(112, annotations={"pragma_auto_unroll_max_step":64, "pragma_unroll_explicit":1}):
+        for i0_0_i1_0_i2_0_i3_0_fused in T.parallel(
+            112, annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}
+        ):
             for i4_0_0 in T.serial(1):
-                for i1_1_init, i2_1_init, i3_1_init, i1_2_init, i2_2_init, i3_2_init in T.grid(2, 1, 1, 1, 14, 8):
+                for i1_1_init, i2_1_init, i3_1_init, i1_2_init, i2_2_init, i3_2_init in T.grid(
+                    2, 1, 1, 1, 14, 8
+                ):
                     with T.block("conv2d_NCHWc_int8_o_init"):
                         n = T.axis.spatial(1, 0)
                         oc_chunk = T.axis.spatial(2, i1_1_init + i1_2_init)
-                        oh = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1_init * 14 + i2_2_init)
-                        ow = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1_init * 8 + i3_2_init)
+                        oh = T.axis.spatial(
+                            112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1_init * 14 + i2_2_init
+                        )
+                        ow = T.axis.spatial(
+                            112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1_init * 8 + i3_2_init
+                        )
                         oc_block_o = T.axis.spatial(1, 0)
                         T.reads()
-                        T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32])
+                        T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32])
                         for i4_1 in T.vectorized(32):
                             with T.block("conv2d_NCHWc_int8_init"):
                                 oc_block_i_init = T.axis.spatial(32, i4_1)
@@ -506,39 +624,103 @@ def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1
                                 T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init])
                                 conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0
                 for i5_0, i6_0, i7_0, i8_0, i9_0_0 in T.grid(1, 1, 1, 1, 1):
-                    for i0_1, i1_1, i2_1, i3_1, i4_0_1, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_2, i1_2, i2_2, i3_2, i4_0_2 in T.grid(1, 2, 1, 1, 1, 7, 7, 1, 1, 1, 1, 1, 14, 8, 1):
+                    for (
+                        i0_1,
+                        i1_1,
+                        i2_1,
+                        i3_1,
+                        i4_0_1,
+                        i5_1,
+                        i6_1,
+                        i7_1,
+                        i8_1,
+                        i9_0_1,
+                        i0_2,
+                        i1_2,
+                        i2_2,
+                        i3_2,
+                        i4_0_2,
+                    ) in T.grid(1, 2, 1, 1, 1, 7, 7, 1, 1, 1, 1, 1, 14, 8, 1):
                         with T.block("conv2d_NCHWc_int8_o_update"):
                             n = T.axis.spatial(1, 0)
                             oc_chunk = T.axis.spatial(2, i1_1 + i1_2)
-                            oh = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1 * 14 + i2_2)
-                            ow = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1 * 8 + i3_2)
+                            oh = T.axis.spatial(
+                                112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1 * 14 + i2_2
+                            )
+                            ow = T.axis.spatial(
+                                112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1 * 8 + i3_2
+                            )
                             oc_block_o = T.axis.spatial(1, 0)
                             kh = T.axis.reduce(7, i5_0 * 7 + i5_1)
                             kw = T.axis.reduce(7, i6_0 * 7 + i6_1)
                             ic_outer = T.axis.reduce(1, 0)
                             ic_f_inner = T.axis.reduce(1, 0)
                             ic_s_inner_o = T.axis.reduce(1, 0)
-                            T.reads(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32], p0[n, ic_outer, oh * 2 + kh, ow * 2 + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 32, 0 : 4])
-                            T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32])
-                            A = T.match_buffer(p0[n, ic_outer, oh * 2 + kh, ow * 2 + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], [4], dtype="uint8", offset_factor=1)
-                            B = T.match_buffer(p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 32, 0 : 4], [32, 4], dtype="int8", offset_factor=1)
-                            C = T.match_buffer(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32], [32], dtype="int32", offset_factor=1)
+                            T.reads(
+                                conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32],
+                                p0[
+                                    n,
+                                    ic_outer,
+                                    oh * 2 + kh,
+                                    ow * 2 + kw,
+                                    ic_f_inner * 4 : ic_f_inner * 4 + 4,
+                                ],
+                                p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                            )
+                            T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32])
+                            A = T.match_buffer(
+                                p0[
+                                    n,
+                                    ic_outer,
+                                    oh * 2 + kh,
+                                    ow * 2 + kw,
+                                    ic_f_inner * 4 : ic_f_inner * 4 + 4,
+                                ],
+                                [4],
+                                dtype="uint8",
+                                offset_factor=1,
+                            )
+                            B = T.match_buffer(
+                                p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                                [32, 4],
+                                dtype="int8",
+                                offset_factor=1,
+                            )
+                            C = T.match_buffer(
+                                conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32],
+                                [32],
+                                dtype="int32",
+                                offset_factor=1,
+                            )
                             A_u8x4: T.uint8x4 = A[0:4]
                             A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
                             B_i8x128 = B[0, 0:128]
                             B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
-                            C[0:32] = T.call_llvm_pure_intrin(4217, T.uint32(3), C[0:32], T.broadcast(A_i32, 32), B_i32x32, dtype="int32x32")
+                            C[0:32] = T.call_llvm_pure_intrin(
+                                4217,
+                                T.uint32(3),
+                                C[0:32],
+                                T.broadcast(A_i32, 32),
+                                B_i32x32,
+                                dtype="int32x32",
+                            )
                 for ax0, ax1, ax2, ax3 in T.grid(1, 2, 14, 8):
                     for ax4_fused in T.vectorized(32):
                         with T.block("T_cast_2"):
                             ax0_1, ax1_1 = T.axis.remap("SS", [ax0, ax1])
-                            ax2_1 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + ax2)
-                            ax3_1 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + ax3)
+                            ax2_1 = T.axis.spatial(
+                                112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + ax2
+                            )
+                            ax3_1 = T.axis.spatial(
+                                112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + ax3
+                            )
                             ax4 = T.axis.spatial(32, ax4_fused)
                             T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
                             T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4])
-                            T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4]
-    
+                            T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[
+                                ax0_1, ax1_1, ax2_1, ax3_1, ax4
+                            ]
+
 
 @tvm.testing.requires_hexagon
 def test_meta(hexagon_session):
@@ -552,7 +734,7 @@ def test_meta(hexagon_session):
     sch = tvm.tir.Schedule(ModulePipelined)
     compute_block = sch.get_block("conv2d_NCHWc_int8_o_update")
     o = sch.get_loops(compute_block)[0]
-    
+
     unscheduled_vtcm_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1)
 
     sch = tvm.tir.Schedule(ModulePipelined)
@@ -562,7 +744,7 @@ def test_meta(hexagon_session):
     sch.annotate(o, "software_pipeline_stage", [0, 1, 2])
     sch.annotate(o, "software_pipeline_order", [0, 1, 2])
    sch.annotate(o, "software_pipeline_async_stages", [0, 2])
-    
+
     pipeline_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1)

     transfer_mb = round((a.size + w.size + c.size) / 1e6, 2)
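Note for reviewers (a sketch, not part of the patch): the new `async_input_output_runtime_larger_buffers` case added above is identical to `async_input_output_runtime` except for the stage numbering. The snippet below restates the annotation pattern with comments on why the numbering matters. It reuses `get_fake_conv_vtcm_schedule`, `size_a`, and `size_w` from this test file; the comments reflect my reading of TVM's software-pipeline transform, not documented guarantees.

    # Build the VTCM schedule and take the outer loop of the compute block "C".
    sch = get_fake_conv_vtcm_schedule(size_a, size_w)
    n = sch.get_loops(sch.get_block("C"))[0]

    # Three pipeline stages that still execute in order 0 -> 1 -> 2: load
    # inputs into VTCM, compute, write outputs back out. Numbering the stages
    # [0, 3, 6] instead of [0, 1, 2] widens the producer/consumer stage
    # distance, so the pipeline transform keeps more iterations in flight and
    # allocates more versions of the VTCM buffers (hence "larger buffers").
    sch.annotate(n, "software_pipeline_stage", [0, 3, 6])
    sch.annotate(n, "software_pipeline_order", [0, 1, 2])

    # Run the input-load stage (0) and output-store stage (6) as
    # asynchronous DMA copies, as in the existing [0, 2] case.
    sch.annotate(n, "software_pipeline_async_stages", [0, 6])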