diff --git a/python/tvm/contrib/ethosu/cascader/device_config.py b/python/tvm/contrib/ethosu/cascader/device_config.py index 4670a238cf960..5abdb302234bc 100644 --- a/python/tvm/contrib/ethosu/cascader/device_config.py +++ b/python/tvm/contrib/ethosu/cascader/device_config.py @@ -439,6 +439,23 @@ def is_partkernel( return part_kernel_first_utilization > depth_first_utilization or ifm_channels <= 8 + def _get_input_banks(self, input_block_shape, input_bytewidth): + input_bytes = input_block_shape.area() * self._align( + input_block_shape.depth * input_bytewidth, 8 + ) + input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2 + input_banks = _round_up(input_banks, self._input_granularity) + + return input_banks + + def _get_accumulator_banks(self, output_block_shape, acc_bytewidth, depth): + acc_depth = _round_up(min(output_block_shape.depth, depth), 8) + acc_bytes = output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth + acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2 + acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth]) + + return acc_banks + def get_elementwise_block_config( self, ifm_propagator: Propagator, @@ -533,16 +550,9 @@ def get_elementwise_block_config( input2_block.round_up(self._input_micro_block) # Banks required for input block - input_bytes = input_block.area() * self._align(input_block.depth * input_bytewidth, 8) - input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2 - input_banks = _round_up(input_banks, self._input_granularity) - + input_banks = self._get_input_banks(input_block, input_bytewidth) # Banks required for input2 block - input2_bytes = input2_block.area() * self._align( - input2_block.depth * input_bytewidth, 8 - ) - input2_banks = _round_up_div(input2_bytes, self._bank_size_bytes) * 2 - input2_banks = _round_up(input2_banks, self._input_granularity) + input2_banks = self._get_input_banks(input2_block, input_bytewidth) # Check whether or not both IFMs fit into SHRAM if (input_banks + input2_banks) <= banks_available: @@ -561,6 +571,29 @@ def get_elementwise_block_config( return block_config + def _get_subkernel_propagator( + self, op_attrs, ifm_propagator, input_layout, output_layout, depth + ): + op_type = op_attrs.get("op") + stride_h = int(op_attrs.get("stride_h", 1)) + stride_w = int(op_attrs.get("stride_w", 1)) + transform = ifm_propagator.transform + + if input_layout == "NHCWB16": + transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h) + transform[3][-1] = min(transform[3][-1], self._subkernel_limits[1] - stride_w) + else: + transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h) + transform[2][-1] = min(transform[2][-1], self._subkernel_limits[1] - stride_w) + + if op_type in ("ethosu_pooling", "ethosu_depthwise_conv2d"): + if output_layout == "NHCWB16" and input_layout == "NHWC": + transform[3][-1] = depth + elif output_layout == "NHCWB16" and input_layout == "NHCWB16": + transform[2][-1] = depth // 16 + + return Propagator(transform, ifm_propagator.offset) + def get_valid_block_configs( self, ifm_propagator: Propagator, @@ -612,33 +645,13 @@ def get_valid_block_configs( op_type = op_attrs.get("op") op_str = op_attrs.get("op_str") activation = op_attrs.get("activation", "NONE") - stride_h = int(op_attrs.get("stride_h", 1)) - stride_w = int(op_attrs.get("stride_w", 1)) upscaling_factor = 1 if op_attrs.get("upscale", "NONE") == "NONE" else 2 - subkernel_transform = ifm_propagator.transform if output_layout == "NHCWB16": output_shape = _Shape([1, ofm_shape[1], ofm_shape[3], ofm_channels]) else: output_shape = _Shape(ofm_shape) - if input_layout == "NHCWB16": - subkernel_transform[1][-1] = min( - subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h - ) - subkernel_transform[3][-1] = min( - subkernel_transform[3][-1], self._subkernel_limits[1] - stride_w - ) - else: - subkernel_transform[1][-1] = min( - subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h - ) - subkernel_transform[2][-1] = min( - subkernel_transform[2][-1], self._subkernel_limits[1] - stride_w - ) - - subkernel_propagator = Propagator(subkernel_transform, ifm_propagator.offset) - # Define search space max_height = min(output_shape.height, self._max_block_shape.height) min_height = max(self._micro_block.height, upscaling_factor) @@ -655,7 +668,7 @@ def get_valid_block_configs( if activation == "LUT" and not self._lut_reserved: banks_available -= 2 - # Input block depth has additional limitations for Operators that require full input depth + # Input block depth has additional limitations for operators that require full input depth input_block_depth = 0 is_partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w) if op_type == "ethosu_conv2d": @@ -669,6 +682,10 @@ def get_valid_block_configs( # Block depth has to be less than full depth or a multiple of the split depth continue + subkernel_propagator = self._get_subkernel_propagator( + op_attrs, ifm_propagator, input_layout, output_layout, depth + ) + for width in range(min_width, max_width + min_width, min_width): for height in range(min_height, max_height + min_height, min_height): if output_layout == "NHCWB16": @@ -709,19 +726,11 @@ def get_valid_block_configs( input_block_shape.depth = input_block_depth # Banks required for input block - input_bytes = input_block_shape.area() * self._align( - input_block_shape.depth * input_bytewidth, 8 - ) - input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2 - input_banks = _round_up(input_banks, self._input_granularity) - + input_banks = self._get_input_banks(input_block_shape, input_bytewidth) # Banks required for accumulation - acc_depth = _round_up(min(output_block_shape.depth, ofm_channels), 8) - acc_bytes = ( - output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth + acc_banks = self._get_accumulator_banks( + output_block_shape, acc_bytewidth, depth ) - acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2 - acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth]) if (input_banks + acc_banks) <= banks_available: output_cycles = self._get_output_cycles( diff --git a/python/tvm/relay/backend/contrib/ethosu/te/binary_elementwise.py b/python/tvm/relay/backend/contrib/ethosu/te/binary_elementwise.py index 9581256303242..9e665009864d6 100644 --- a/python/tvm/relay/backend/contrib/ethosu/te/binary_elementwise.py +++ b/python/tvm/relay/backend/contrib/ethosu/te/binary_elementwise.py @@ -22,6 +22,7 @@ from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher from .dma import dma_ofm_compute, dma_ifm_compute +from .common import get_layout_transform_matrices def binary_elementwise_compute( @@ -196,21 +197,8 @@ def binary_elementwise_compute( attrs=binary_elementwise_attrs, ) - nhwc_to_nhcwb16 = [ - [1, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 1 / 16, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 0, 16], - [0, 0, 0, 0, 1], - ] - nhcwb16_to_nhwc = [ - [1, 0, 0, 0, 0, 0], - [0, 1, 0, 0, 0, 0], - [0, 0, 0, 1, 0, 0], - [0, 0, 16, 0, 1, -16], - [0, 0, 0, 0, 0, 1], - ] + nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(int(ifm_channels)) + ifm_matrix = [ [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], diff --git a/python/tvm/relay/backend/contrib/ethosu/te/common.py b/python/tvm/relay/backend/contrib/ethosu/te/common.py new file mode 100644 index 0000000000000..aac060308efcd --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/te/common.py @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Common methods for the NPU tensor expressions""" + +from typing import Tuple, List + + +def get_layout_transform_matrices(ofm_channels: int) -> Tuple[List[List[float]], List[List[float]]]: + """Get the NHWC->NHCWB16 and NHCWB16->NHWC layout transform matrices. + For information about the supported layouts see https://developer.arm.com/documentation/102420/ + 0200/Functional-description/Control-and-data-flow/Supported-memory-formats-for-feature-maps + + Parameters + ---------- + ofm_channels : int + The number of output channels in a NHWC layout + + Returns + ------- + nhwc_to_nhcwb16, nhcwb16_to_nhwc : Tuple[List[List[float]], List[List[float]]] + The layout transformation matrices + """ + + # The value of the last dimension (B16) is always 16. + nhwc_to_nhcwb16 = [ + [1, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 1 / 16, 0], + [0, 0, 1, 0, 0], + [0, 0, 0, 0, 16], + [0, 0, 0, 0, 1], + ] + + # When we convert from NHWC to NHCWB16, the new C value is given by + # (ofm_channels - 1) // 16 + 1, which is a lossy operation, so we need to use + # the actual value of channels in the transform matrix to accurately recover + # the C in NHWC when we convert from NHCWB16 to NHWC. + nhcwb16_to_nhwc = [ + [1, 0, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0], + [0, 0, 0, 1, 0, 0], + [0, 0, 0, 0, 0, ofm_channels], + [0, 0, 0, 0, 0, 1], + ] + + return nhwc_to_nhcwb16, nhcwb16_to_nhwc diff --git a/python/tvm/relay/backend/contrib/ethosu/te/convolution.py b/python/tvm/relay/backend/contrib/ethosu/te/convolution.py index 77bc5a300cbe4..e309ab5a2af4d 100644 --- a/python/tvm/relay/backend/contrib/ethosu/te/convolution.py +++ b/python/tvm/relay/backend/contrib/ethosu/te/convolution.py @@ -23,6 +23,7 @@ from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher from .dma import dma_ofm_compute, dma_ifm_compute +from .common import get_layout_transform_matrices def conv2d_compute( @@ -175,21 +176,8 @@ def conv2d_compute( attrs=conv2d_attrs, ) - nhwc_to_nhcwb16 = [ - [1, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 1 / 16, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 0, 16], - [0, 0, 0, 0, 1], - ] - nhcwb16_to_nhwc = [ - [1, 0, 0, 0, 0, 0], - [0, 1, 0, 0, 0, 0], - [0, 0, 0, 1, 0, 0], - [0, 0, 16, 0, 1, -16], - [0, 0, 0, 0, 0, 1], - ] + nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(ofm_channels) + ifm_matrix = [ [1, 0, 0, 0, 0], [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)], diff --git a/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py b/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py index 79d4f05f9cf26..03ce0e5349640 100644 --- a/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py +++ b/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py @@ -23,6 +23,7 @@ from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher from .dma import dma_ofm_compute, dma_ifm_compute +from .common import get_layout_transform_matrices def depthwise_conv2d_compute( @@ -169,21 +170,8 @@ def depthwise_conv2d_compute( attrs=depthwise_conv2d_attrs, ) - nhwc_to_nhcwb16 = [ - [1, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 1 / 16, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 0, 16], - [0, 0, 0, 0, 1], - ] - nhcwb16_to_nhwc = [ - [1, 0, 0, 0, 0, 0], - [0, 1, 0, 0, 0, 0], - [0, 0, 0, 1, 0, 0], - [0, 0, 16, 0, 1, -16], - [0, 0, 0, 0, 0, 1], - ] + nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(channels) + ifm_matrix = [ [1, 0, 0, 0, 0], [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)], diff --git a/python/tvm/relay/backend/contrib/ethosu/te/pooling.py b/python/tvm/relay/backend/contrib/ethosu/te/pooling.py index f1b065cbcf17f..8c20ea7165265 100644 --- a/python/tvm/relay/backend/contrib/ethosu/te/pooling.py +++ b/python/tvm/relay/backend/contrib/ethosu/te/pooling.py @@ -23,6 +23,7 @@ from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher from .dma import dma_ofm_compute, dma_ifm_compute +from .common import get_layout_transform_matrices def pooling_compute( @@ -157,21 +158,8 @@ def pooling_compute( attrs=pooling_attrs, ) - nhwc_to_nhcwb16 = [ - [1, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 1 / 16, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 0, 16], - [0, 0, 0, 0, 1], - ] - nhcwb16_to_nhwc = [ - [1, 0, 0, 0, 0, 0], - [0, 1, 0, 0, 0, 0], - [0, 0, 0, 1, 0, 0], - [0, 0, 16, 0, 1, -16], - [0, 0, 0, 0, 0, 1], - ] + nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(int(ofm_channels)) + ifm_matrix = [ [1, 0, 0, 0, 0], [0, stride_h, 0, 0, (pool_shape_h - stride_h)], diff --git a/python/tvm/relay/backend/contrib/ethosu/te/unary_elementwise.py b/python/tvm/relay/backend/contrib/ethosu/te/unary_elementwise.py index 69f06be955cba..50bbd36d98002 100644 --- a/python/tvm/relay/backend/contrib/ethosu/te/unary_elementwise.py +++ b/python/tvm/relay/backend/contrib/ethosu/te/unary_elementwise.py @@ -21,6 +21,7 @@ from tvm import te from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher from .dma import dma_ofm_compute, dma_ifm_compute +from .common import get_layout_transform_matrices def unary_elementwise_compute( @@ -129,21 +130,8 @@ def clz_imp(inp): attrs=unary_elementwise_attrs, ) - nhwc_to_nhcwb16 = [ - [1, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 1 / 16, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 0, 16], - [0, 0, 0, 0, 1], - ] - nhcwb16_to_nhwc = [ - [1, 0, 0, 0, 0, 0], - [0, 1, 0, 0, 0, 0], - [0, 0, 0, 1, 0, 0], - [0, 0, 16, 0, 1, -16], - [0, 0, 0, 0, 0, 1], - ] + nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(int(ofm_channels)) + ifm_matrix = [ [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], diff --git a/src/contrib/ethosu/cascader/block_config.cc b/src/contrib/ethosu/cascader/block_config.cc index afa65de013569..667d2e1ebefb3 100644 --- a/src/contrib/ethosu/cascader/block_config.cc +++ b/src/contrib/ethosu/cascader/block_config.cc @@ -37,6 +37,8 @@ void BlockConfigNode::VisitAttrs(AttrVisitor* v) { v->Visit("_input_shape", &tmp_arr); tmp_arr = make_array(output_shape_); v->Visit("_output_shape", &tmp_arr); + v->Visit("_compute_cycles", &compute_cycles_); + v->Visit("_output_cycles", &output_cycles_); } BlockConfig::BlockConfig(const std::vector& input_shape, const std::vector& output_shape, diff --git a/tests/python/contrib/test_ethosu/cascader/infra.py b/tests/python/contrib/test_ethosu/cascader/infra.py index aa681c41f2108..614fed97a0a54 100644 --- a/tests/python/contrib/test_ethosu/cascader/infra.py +++ b/tests/python/contrib/test_ethosu/cascader/infra.py @@ -55,6 +55,7 @@ def make_simple_home_map(graph, var_region, const_region): if ethosu_enabled: from tvm.relay.backend.contrib.ethosu.tir.compiler import extract_constants, lower_to_te + from tvm.relay.backend.contrib.ethosu.te.common import get_layout_transform_matrices def create_te_graph(func): func, consts = extract_constants(func) @@ -64,28 +65,24 @@ def create_te_graph(func): return te_graph, consts def make_matrices( - op_type, kernel, stride, padding, ifm_layout, ofm_layout, dilation=(1, 1), ifm_channels=1 + op_type, + kernel, + stride, + padding, + ifm_layout, + ofm_layout, + dilation=(1, 1), + ifm_channels=1, + ofm_channels=1, ): kernel_h, kernel_w = kernel stride_h, stride_w = stride dilation_h, dilation_w = dilation dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 - nhwc_to_nhcwb16 = [ - [1, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 1 / 16, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 0, 16], - [0, 0, 0, 0, 1], - ] - nhcwb16_to_nhwc = [ - [1, 0, 0, 0, 0, 0], - [0, 1, 0, 0, 0, 0], - [0, 0, 0, 1, 0, 0], - [0, 0, 16, 0, 1, -16], - [0, 0, 0, 0, 0, 1], - ] + + nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(ofm_channels) + if op_type == "ethosu_conv2d": ifm_matrix = [ [1, 0, 0, 0, 0], diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_binary_elementwise_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_binary_elementwise_matcher.py index bb1be7b8e251d..062e5ba0fafd5 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_binary_elementwise_matcher.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_binary_elementwise_matcher.py @@ -27,25 +27,12 @@ match_ethosu_binary_elementwise, binary_elementwise_compute, ) +from tvm.relay.backend.contrib.ethosu.te.common import get_layout_transform_matrices -def _make_matrices(broadcast, ifm_layout, ifm2_layout, ofm_layout): +def _make_matrices(broadcast, ifm_layout, ifm2_layout, ofm_layout, ofm_channels): broadcast_h, broadcast_w, broadcast_c = broadcast - nhwc_to_nhcwb16 = [ - [1, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 1 / 16, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 0, 16], - [0, 0, 0, 0, 1], - ] - nhcwb16_to_nhwc = [ - [1, 0, 0, 0, 0, 0], - [0, 1, 0, 0, 0, 0], - [0, 0, 0, 1, 0, 0], - [0, 0, 16, 0, 1, -16], - [0, 0, 0, 0, 0, 1], - ] + nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(ofm_channels) ifm_matrix = [ [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], @@ -93,14 +80,8 @@ def test_ethosu_binary_elementwise_matcher( ifm2_shape = [1] + [1 if (b == 1) else a for a, b in zip(ofm_shape[1:], ifm2_broadcast)] ifm_channels = ifm_shape[3] ifm2_channels = ifm2_shape[3] - nhwc_to_nhcwb16 = [ - [1, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 1 / 16, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 0, 16], - [0, 0, 0, 0, 1], - ] + ofm_channels = ofm_shape[3] + nhwc_to_nhcwb16, _ = get_layout_transform_matrices(ofm_channels) broadcast = [1 if a == 1 else 0 for a in ifm2_shape[1:]] if ifm_layout == "NHCWB16": ifm_shape = [ @@ -173,10 +154,7 @@ def test_ethosu_binary_elementwise_matcher( output_stripe_config = cs.StripeConfig(ofm_shape, ofm_shape, ofm_shape, order, stripes, offset) (ifm_transform, ifm2_transform) = _make_matrices( - broadcast, - ifm_layout, - ifm2_layout, - ofm_layout, + broadcast, ifm_layout, ifm2_layout, ofm_layout, ofm_channels ) device_config = cs.EthosuDeviceConfig("ethos-u55-256") @@ -190,19 +168,10 @@ def test_ethosu_binary_elementwise_matcher( propagated_ifm = ifm_propagator.propagate(output_stripe_config).shape propagated_ifm2 = ifm2_propagator.propagate(output_stripe_config).shape - # Layout conversions will align the propagated IFMs to the brick, i.e. 16 - # so the expected ifm(2)_shape needs to be rounded up to 16 - if ifm_layout != ofm_layout: - assert ifm_shape[:-1] == propagated_ifm[:-1] - assert ((ifm_shape[-1] + 16 - 1) // 16) * 16 == propagated_ifm[-1] - else: - assert ifm_shape == propagated_ifm - - if ifm2_layout != ofm_layout: - assert ifm2_shape[:-1] == propagated_ifm2[:-1] - assert ((ifm2_shape[-1] + 16 - 1) // 16) * 16 == propagated_ifm2[-1] - else: - assert ifm2_shape == propagated_ifm2 + # The layout transforms that have the exact number of output channels in them + # will lose no information about the number of channels + assert ifm_shape == propagated_ifm + assert ifm2_shape == propagated_ifm2 if __name__ == "__main__": diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py index 18f15f9257dbf..09fd056ce794c 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py @@ -22,6 +22,7 @@ import math import tvm.contrib.ethosu.cascader as cs +from tvm.relay.backend.contrib.ethosu.te.common import get_layout_transform_matrices from .infra import make_matrices @@ -164,7 +165,7 @@ ((1, 6, 5, 16), (1, 6, 1, 5, 16)), ((1, 4, 4, 16), (1, 4, 1, 4, 16)), ((1, 8, 4, 16), (1, 8, 1, 4, 16)), - ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 16, 1, 4, 4)), + ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 10, 1, 6, 4)), ((1, 6, 5, 16), (1, 6, 1, 5, 16)), # Depthwise Conv2D ((1, 6, 10, 16), (1, 6, 1, 10, 16)), @@ -182,7 +183,7 @@ ((1, 6, 5, 16), (1, 6, 1, 5, 16)), ((1, 4, 4, 16), (1, 4, 1, 4, 16)), ((1, 8, 4, 16), (1, 8, 1, 4, 16)), - ((1, 10, 6, 8), (1, 16, 1, 4, 8)), + ((1, 10, 6, 8), (1, 10, 1, 6, 8)), ((1, 6, 5, 16), (1, 6, 1, 5, 16)), # Depthwise Conv2D ((1, 6, 10, 16), (1, 6, 1, 10, 16)), @@ -244,28 +245,23 @@ def test_best_block_config( acc_config, expected_block_configs, ): - nhwc_to_nhcwb16 = [ - [1, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 1 / 16, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 0, 16], - [0, 0, 0, 0, 1], - ] - nhcwb16_to_nhwc = [ - [1, 0, 0, 0, 0, 0], - [0, 1, 0, 0, 0, 0], - [0, 0, 0, 1, 0, 0], - [0, 0, 16, 0, 1, -16], - [0, 0, 0, 0, 0, 1], - ] - ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices( - op_type, kernel, stride, padding, layouts[0], layouts[1], dilation, in_shape[3] - ) - ofm_channels = out_shape[3] ifm_channels = in_shape[3] + nhwc_to_nhcwb16, _ = get_layout_transform_matrices(ofm_channels) + + ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices( + op_type, + kernel, + stride, + padding, + layouts[0], + layouts[1], + dilation, + ifm_channels, + ofm_channels, + ) + if layouts[0] == "NHCWB16": in_shape = [ int(math.ceil(n)) for n in np.matmul(nhwc_to_nhcwb16, in_shape + (1,)).tolist()[:-1] @@ -321,9 +317,12 @@ def test_best_block_config( # Add tensors input_tensor = cs.Tensor(in_shape, "int8") part.set_input(0, input_tensor) - if op_type in ("ethosu_conv2d", "ethosu_depthwise_conv2d"): + if op_type == "ethosu_conv2d": weight_tensor = cs.Tensor([ofm_channels, kernel[0], kernel[1], ifm_channels], "int8") part.set_input(1, weight_tensor) + elif op_type == "ethosu_depthwise_conv2d": + weight_tensor = cs.Tensor([ofm_channels, kernel[0], kernel[1], 1], "int8") + part.set_input(1, weight_tensor) output_tensor = cs.Tensor(out_shape, "int8") part.set_output(output_tensor) diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py index 5bd2be49f6204..17b41cbaf511e 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py @@ -82,6 +82,7 @@ def test_ethosu_conv2d_matcher( ofm_layout, dilation, ifm_channels, + ofm_channels, ) device_config = cs.EthosuDeviceConfig("ethos-u55-256") diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_depthwise2d_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_depthwise2d_matcher.py index c2c45b6524f1b..1e6b6d58b24af 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_depthwise2d_matcher.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_depthwise2d_matcher.py @@ -83,6 +83,7 @@ def test_ethosu_depthwise2d_matcher(kernel, stride, dilation, padding, ifm_layou ifm_layout, ofm_layout, dilation, + ofm_channels=ofm_channels, ) device_config = cs.EthosuDeviceConfig("ethos-u55-256") diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_pooling_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_pooling_matcher.py index 6ce8ee9a2986d..b998ddaf70457 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_pooling_matcher.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_pooling_matcher.py @@ -66,6 +66,7 @@ def test_ethosu_pooling_matcher(pool_shape, stride, padding, ifm_layout, ofm_lay padding, ifm_layout, ofm_layout, + ofm_channels=ofm_channels, ) device_config = cs.EthosuDeviceConfig("ethos-u55-256") diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_unary_elementwise_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_unary_elementwise_matcher.py index 0570524e09073..8139f1518f56e 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_unary_elementwise_matcher.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_unary_elementwise_matcher.py @@ -27,24 +27,11 @@ match_ethosu_unary_elementwise, unary_elementwise_compute, ) +from tvm.relay.backend.contrib.ethosu.te.common import get_layout_transform_matrices -def _make_matrices(ifm_layout, ofm_layout): - nhwc_to_nhcwb16 = [ - [1, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 1 / 16, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 0, 16], - [0, 0, 0, 0, 1], - ] - nhcwb16_to_nhwc = [ - [1, 0, 0, 0, 0, 0], - [0, 1, 0, 0, 0, 0], - [0, 0, 0, 1, 0, 0], - [0, 0, 16, 0, 1, -16], - [0, 0, 0, 0, 0, 1], - ] +def _make_matrices(ifm_layout, ofm_layout, ofm_channels): + nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(ofm_channels) ifm_matrix = [ [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], @@ -76,14 +63,7 @@ def _make_matrices(ifm_layout, ofm_layout): def test_ethosu_unary_elementwise_matcher(ofm_shape, ifm_layout, ofm_layout, op_type): ifm_shape = ofm_shape.copy() ofm_channels = ofm_shape[3] - nhwc_to_nhcwb16 = [ - [1, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 1 / 16, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 0, 16], - [0, 0, 0, 0, 1], - ] + nhwc_to_nhcwb16, _ = get_layout_transform_matrices(ofm_channels) if ifm_layout == "NHCWB16": ifm_shape = [ int(math.ceil(n)) @@ -134,7 +114,7 @@ def test_ethosu_unary_elementwise_matcher(ofm_shape, ifm_layout, ofm_layout, op_ stripes = [0] * len(ofm_shape) output_stripe_config = cs.StripeConfig(ofm_shape, ofm_shape, ofm_shape, order, stripes, offset) - ifm_transform = _make_matrices(ifm_layout, ofm_layout) + ifm_transform = _make_matrices(ifm_layout, ofm_layout, ofm_channels) device_config = cs.EthosuDeviceConfig("ethos-u55-256") part = match_ethosu_unary_elementwise(out, device_config) @@ -145,13 +125,9 @@ def test_ethosu_unary_elementwise_matcher(ofm_shape, ifm_layout, ofm_layout, op_ propagated_ifm = ifm_propagator.propagate(output_stripe_config).shape - # Layout conversions will align the propagated IFMs to the brick, i.e. 16 - # so the expected ifm_shape needs to be rounded up to 16 - if ifm_layout != ofm_layout: - assert ifm_shape[:-1] == propagated_ifm[:-1] - assert ((ifm_shape[-1] + 16 - 1) // 16) * 16 == propagated_ifm[-1] - else: - assert ifm_shape == propagated_ifm + # The layout transforms that have the exact number of output channels in them + # will lose no information about the number of channels + assert ifm_shape == propagated_ifm if __name__ == "__main__":