From 0cf18d06516f13a539b444795b7de2effbadfd58 Mon Sep 17 00:00:00 2001
From: Elen Kalda <elen.kalda@arm.com>
Date: Wed, 30 Mar 2022 08:33:25 +0100
Subject: [PATCH] [microNPU] Tweak a layout transform matrix (#10763)

* [microNPU] Fix layout transform matrix

One of the layout transforms currently causes the cascader to stripe
across the B16 axis (which is not allowed), so change that transform and
handle the resulting implications for get_valid_block_configs.

Change-Id: I04199f9f35fcc31618581567483cfb80d3b5aad2

* Reduce the duplication of layout transform matrices

* Change the nhcwb16_to_nhwc matrix for binary and unary elementwise
  such that it matches the other NPU ops
* Reduce the number of places where the same layout transform matrices are
  defined

* Add documentation to the layout transform matrices
---
 .../contrib/ethosu/cascader/device_config.py  | 91 ++++++++++---------
 .../contrib/ethosu/te/binary_elementwise.py   | 18 +---
 .../relay/backend/contrib/ethosu/te/common.py | 60 ++++++++++++
 .../backend/contrib/ethosu/te/convolution.py  | 18 +---
 .../backend/contrib/ethosu/te/depthwise.py    | 18 +---
 .../backend/contrib/ethosu/te/pooling.py      | 18 +---
 .../contrib/ethosu/te/unary_elementwise.py    | 18 +---
 src/contrib/ethosu/cascader/block_config.cc   |  2 +
 .../contrib/test_ethosu/cascader/infra.py     | 29 +++---
 .../test_ethosu_binary_elementwise_matcher.py | 51 ++---------
 .../cascader/test_ethosu_block_config.py      | 43 +++++----
 .../cascader/test_ethosu_conv2d_matcher.py    |  1 +
 .../test_ethosu_depthwise2d_matcher.py        |  1 +
 .../cascader/test_ethosu_pooling_matcher.py   |  1 +
 .../test_ethosu_unary_elementwise_matcher.py  | 40 ++------
 15 files changed, 182 insertions(+), 227 deletions(-)
 create mode 100644 python/tvm/relay/backend/contrib/ethosu/te/common.py

diff --git a/python/tvm/contrib/ethosu/cascader/device_config.py b/python/tvm/contrib/ethosu/cascader/device_config.py
index 4670a238cf960..5abdb302234bc 100644
--- a/python/tvm/contrib/ethosu/cascader/device_config.py
+++ b/python/tvm/contrib/ethosu/cascader/device_config.py
@@ -439,6 +439,23 @@ def is_partkernel(
 
         return part_kernel_first_utilization > depth_first_utilization or ifm_channels <= 8
 
+    def _get_input_banks(self, input_block_shape, input_bytewidth):
+        input_bytes = input_block_shape.area() * self._align(
+            input_block_shape.depth * input_bytewidth, 8
+        )
+        input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
+        input_banks = _round_up(input_banks, self._input_granularity)
+
+        return input_banks
+
+    def _get_accumulator_banks(self, output_block_shape, acc_bytewidth, depth):
+        acc_depth = _round_up(min(output_block_shape.depth, depth), 8)
+        acc_bytes = output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
+        acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
+        acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])
+
+        return acc_banks
+
     def get_elementwise_block_config(
         self,
         ifm_propagator: Propagator,
@@ -533,16 +550,9 @@ def get_elementwise_block_config(
             input2_block.round_up(self._input_micro_block)
 
             # Banks required for input block
-            input_bytes = input_block.area() * self._align(input_block.depth * input_bytewidth, 8)
-            input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
-            input_banks = _round_up(input_banks, self._input_granularity)
-
+            input_banks = self._get_input_banks(input_block, input_bytewidth)
             # Banks required for input2 block
-            input2_bytes = input2_block.area() * self._align(
-                input2_block.depth * input_bytewidth, 8
-            )
-            input2_banks = _round_up_div(input2_bytes, self._bank_size_bytes) * 2
-            input2_banks = _round_up(input2_banks, self._input_granularity)
+            input2_banks = self._get_input_banks(input2_block, input_bytewidth)
 
             # Check whether or not both IFMs fit into SHRAM
             if (input_banks + input2_banks) <= banks_available:
@@ -561,6 +571,29 @@ def get_elementwise_block_config(
 
         return block_config
 
+    def _get_subkernel_propagator(
+        self, op_attrs, ifm_propagator, input_layout, output_layout, depth
+    ):
+        op_type = op_attrs.get("op")
+        stride_h = int(op_attrs.get("stride_h", 1))
+        stride_w = int(op_attrs.get("stride_w", 1))
+        transform = ifm_propagator.transform
+
+        if input_layout == "NHCWB16":
+            transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h)
+            transform[3][-1] = min(transform[3][-1], self._subkernel_limits[1] - stride_w)
+        else:
+            transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h)
+            transform[2][-1] = min(transform[2][-1], self._subkernel_limits[1] - stride_w)
+
+        if op_type in ("ethosu_pooling", "ethosu_depthwise_conv2d"):
+            if output_layout == "NHCWB16" and input_layout == "NHWC":
+                transform[3][-1] = depth
+            elif output_layout == "NHCWB16" and input_layout == "NHCWB16":
+                transform[2][-1] = depth // 16
+
+        return Propagator(transform, ifm_propagator.offset)
+
     def get_valid_block_configs(
         self,
         ifm_propagator: Propagator,
@@ -612,33 +645,13 @@ def get_valid_block_configs(
         op_type = op_attrs.get("op")
         op_str = op_attrs.get("op_str")
         activation = op_attrs.get("activation", "NONE")
-        stride_h = int(op_attrs.get("stride_h", 1))
-        stride_w = int(op_attrs.get("stride_w", 1))
         upscaling_factor = 1 if op_attrs.get("upscale", "NONE") == "NONE" else 2
 
-        subkernel_transform = ifm_propagator.transform
         if output_layout == "NHCWB16":
             output_shape = _Shape([1, ofm_shape[1], ofm_shape[3], ofm_channels])
         else:
             output_shape = _Shape(ofm_shape)
 
-        if input_layout == "NHCWB16":
-            subkernel_transform[1][-1] = min(
-                subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
-            )
-            subkernel_transform[3][-1] = min(
-                subkernel_transform[3][-1], self._subkernel_limits[1] - stride_w
-            )
-        else:
-            subkernel_transform[1][-1] = min(
-                subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
-            )
-            subkernel_transform[2][-1] = min(
-                subkernel_transform[2][-1], self._subkernel_limits[1] - stride_w
-            )
-
-        subkernel_propagator = Propagator(subkernel_transform, ifm_propagator.offset)
-
         # Define search space
         max_height = min(output_shape.height, self._max_block_shape.height)
         min_height = max(self._micro_block.height, upscaling_factor)
@@ -655,7 +668,7 @@ def get_valid_block_configs(
         if activation == "LUT" and not self._lut_reserved:
             banks_available -= 2
 
-        # Input block depth has additional limitations for Operators that require full input depth
+        # Input block depth has additional limitations for operators that require full input depth
         input_block_depth = 0
         is_partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w)
         if op_type == "ethosu_conv2d":
@@ -669,6 +682,10 @@ def get_valid_block_configs(
                 # Block depth has to be less than full depth or a multiple of the split depth
                 continue
 
+            subkernel_propagator = self._get_subkernel_propagator(
+                op_attrs, ifm_propagator, input_layout, output_layout, depth
+            )
+
             for width in range(min_width, max_width + min_width, min_width):
                 for height in range(min_height, max_height + min_height, min_height):
                     if output_layout == "NHCWB16":
@@ -709,19 +726,11 @@ def get_valid_block_configs(
                         input_block_shape.depth = input_block_depth
 
                     # Banks required for input block
-                    input_bytes = input_block_shape.area() * self._align(
-                        input_block_shape.depth * input_bytewidth, 8
-                    )
-                    input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
-                    input_banks = _round_up(input_banks, self._input_granularity)
-
+                    input_banks = self._get_input_banks(input_block_shape, input_bytewidth)
                     # Banks required for accumulation
-                    acc_depth = _round_up(min(output_block_shape.depth, ofm_channels), 8)
-                    acc_bytes = (
-                        output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
+                    acc_banks = self._get_accumulator_banks(
+                        output_block_shape, acc_bytewidth, depth
                     )
-                    acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
-                    acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])
 
                     if (input_banks + acc_banks) <= banks_available:
                         output_cycles = self._get_output_cycles(
diff --git a/python/tvm/relay/backend/contrib/ethosu/te/binary_elementwise.py b/python/tvm/relay/backend/contrib/ethosu/te/binary_elementwise.py
index 9581256303242..9e665009864d6 100644
--- a/python/tvm/relay/backend/contrib/ethosu/te/binary_elementwise.py
+++ b/python/tvm/relay/backend/contrib/ethosu/te/binary_elementwise.py
@@ -22,6 +22,7 @@
 from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher
 
 from .dma import dma_ofm_compute, dma_ifm_compute
+from .common import get_layout_transform_matrices
 
 
 def binary_elementwise_compute(
@@ -196,21 +197,8 @@ def binary_elementwise_compute(
             attrs=binary_elementwise_attrs,
         )
 
-    nhwc_to_nhcwb16 = [
-        [1, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0],
-        [0, 0, 0, 1 / 16, 0],
-        [0, 0, 1, 0, 0],
-        [0, 0, 0, 0, 16],
-        [0, 0, 0, 0, 1],
-    ]
-    nhcwb16_to_nhwc = [
-        [1, 0, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0, 0],
-        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
-        [0, 0, 0, 0, 0, 1],
-    ]
+    nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(int(ifm_channels))
+
     ifm_matrix = [
         [1, 0, 0, 0, 0],
         [0, 1, 0, 0, 0],
diff --git a/python/tvm/relay/backend/contrib/ethosu/te/common.py b/python/tvm/relay/backend/contrib/ethosu/te/common.py
new file mode 100644
index 0000000000000..aac060308efcd
--- /dev/null
+++ b/python/tvm/relay/backend/contrib/ethosu/te/common.py
@@ -0,0 +1,60 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Common methods for the NPU tensor expressions"""
+
+from typing import Tuple, List
+
+
+def get_layout_transform_matrices(ofm_channels: int) -> Tuple[List[List[float]], List[List[float]]]:
+    """Get the NHWC->NHCWB16 and NHCWB16->NHWC layout transform matrices.
+    For information about the supported layouts see https://developer.arm.com/documentation/102420/
+    0200/Functional-description/Control-and-data-flow/Supported-memory-formats-for-feature-maps
+
+    Parameters
+    ----------
+    ofm_channels : int
+        The number of output channels in an NHWC layout
+
+    Returns
+    -------
+    nhwc_to_nhcwb16, nhcwb16_to_nhwc : Tuple[List[List[float]], List[List[float]]]
+        The layout transformation matrices
+    """
+
+    # The value of the last dimension (B16) is always 16.
+    nhwc_to_nhcwb16 = [
+        [1, 0, 0, 0, 0],
+        [0, 1, 0, 0, 0],
+        [0, 0, 0, 1 / 16, 0],
+        [0, 0, 1, 0, 0],
+        [0, 0, 0, 0, 16],
+        [0, 0, 0, 0, 1],
+    ]
+
+    # When we convert from NHWC to NHCWB16, the new C value is given by
+    # (ofm_channels - 1) // 16 + 1, which is a lossy operation, so we need to use
+    # the actual value of channels in the transform matrix to accurately recover
+    # the C in NHWC when we convert from NHCWB16 to NHWC.
+    nhcwb16_to_nhwc = [
+        [1, 0, 0, 0, 0, 0],
+        [0, 1, 0, 0, 0, 0],
+        [0, 0, 0, 1, 0, 0],
+        [0, 0, 0, 0, 0, ofm_channels],
+        [0, 0, 0, 0, 0, 1],
+    ]
+
+    return nhwc_to_nhcwb16, nhcwb16_to_nhwc
diff --git a/python/tvm/relay/backend/contrib/ethosu/te/convolution.py b/python/tvm/relay/backend/contrib/ethosu/te/convolution.py
index 77bc5a300cbe4..e309ab5a2af4d 100644
--- a/python/tvm/relay/backend/contrib/ethosu/te/convolution.py
+++ b/python/tvm/relay/backend/contrib/ethosu/te/convolution.py
@@ -23,6 +23,7 @@
 from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher
 
 from .dma import dma_ofm_compute, dma_ifm_compute
+from .common import get_layout_transform_matrices
 
 
 def conv2d_compute(
@@ -175,21 +176,8 @@ def conv2d_compute(
         attrs=conv2d_attrs,
     )
 
-    nhwc_to_nhcwb16 = [
-        [1, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0],
-        [0, 0, 0, 1 / 16, 0],
-        [0, 0, 1, 0, 0],
-        [0, 0, 0, 0, 16],
-        [0, 0, 0, 0, 1],
-    ]
-    nhcwb16_to_nhwc = [
-        [1, 0, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0, 0],
-        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
-        [0, 0, 0, 0, 0, 1],
-    ]
+    nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(ofm_channels)
+
     ifm_matrix = [
         [1, 0, 0, 0, 0],
         [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)],
diff --git a/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py b/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py
index 79d4f05f9cf26..03ce0e5349640 100644
--- a/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py
+++ b/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py
@@ -23,6 +23,7 @@
 from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher
 
 from .dma import dma_ofm_compute, dma_ifm_compute
+from .common import get_layout_transform_matrices
 
 
 def depthwise_conv2d_compute(
@@ -169,21 +170,8 @@ def depthwise_conv2d_compute(
         attrs=depthwise_conv2d_attrs,
     )
 
-    nhwc_to_nhcwb16 = [
-        [1, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0],
-        [0, 0, 0, 1 / 16, 0],
-        [0, 0, 1, 0, 0],
-        [0, 0, 0, 0, 16],
-        [0, 0, 0, 0, 1],
-    ]
-    nhcwb16_to_nhwc = [
-        [1, 0, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0, 0],
-        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
-        [0, 0, 0, 0, 0, 1],
-    ]
+    nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(channels)
+
     ifm_matrix = [
         [1, 0, 0, 0, 0],
         [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)],
diff --git a/python/tvm/relay/backend/contrib/ethosu/te/pooling.py b/python/tvm/relay/backend/contrib/ethosu/te/pooling.py
index f1b065cbcf17f..8c20ea7165265 100644
--- a/python/tvm/relay/backend/contrib/ethosu/te/pooling.py
+++ b/python/tvm/relay/backend/contrib/ethosu/te/pooling.py
@@ -23,6 +23,7 @@
 from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher
 
 from .dma import dma_ofm_compute, dma_ifm_compute
+from .common import get_layout_transform_matrices
 
 
 def pooling_compute(
@@ -157,21 +158,8 @@ def pooling_compute(
         attrs=pooling_attrs,
     )
 
-    nhwc_to_nhcwb16 = [
-        [1, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0],
-        [0, 0, 0, 1 / 16, 0],
-        [0, 0, 1, 0, 0],
-        [0, 0, 0, 0, 16],
-        [0, 0, 0, 0, 1],
-    ]
-    nhcwb16_to_nhwc = [
-        [1, 0, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0, 0],
-        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
-        [0, 0, 0, 0, 0, 1],
-    ]
+    nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(int(ofm_channels))
+
     ifm_matrix = [
         [1, 0, 0, 0, 0],
         [0, stride_h, 0, 0, (pool_shape_h - stride_h)],
diff --git a/python/tvm/relay/backend/contrib/ethosu/te/unary_elementwise.py b/python/tvm/relay/backend/contrib/ethosu/te/unary_elementwise.py
index 69f06be955cba..50bbd36d98002 100644
--- a/python/tvm/relay/backend/contrib/ethosu/te/unary_elementwise.py
+++ b/python/tvm/relay/backend/contrib/ethosu/te/unary_elementwise.py
@@ -21,6 +21,7 @@
 from tvm import te
 from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher
 from .dma import dma_ofm_compute, dma_ifm_compute
+from .common import get_layout_transform_matrices
 
 
 def unary_elementwise_compute(
@@ -129,21 +130,8 @@ def clz_imp(inp):
         attrs=unary_elementwise_attrs,
     )
 
-    nhwc_to_nhcwb16 = [
-        [1, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0],
-        [0, 0, 0, 1 / 16, 0],
-        [0, 0, 1, 0, 0],
-        [0, 0, 0, 0, 16],
-        [0, 0, 0, 0, 1],
-    ]
-    nhcwb16_to_nhwc = [
-        [1, 0, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0, 0],
-        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
-        [0, 0, 0, 0, 0, 1],
-    ]
+    nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(int(ofm_channels))
+
     ifm_matrix = [
         [1, 0, 0, 0, 0],
         [0, 1, 0, 0, 0],
diff --git a/src/contrib/ethosu/cascader/block_config.cc b/src/contrib/ethosu/cascader/block_config.cc
index afa65de013569..667d2e1ebefb3 100644
--- a/src/contrib/ethosu/cascader/block_config.cc
+++ b/src/contrib/ethosu/cascader/block_config.cc
@@ -37,6 +37,8 @@ void BlockConfigNode::VisitAttrs(AttrVisitor* v) {
   v->Visit("_input_shape", &tmp_arr);
   tmp_arr = make_array(output_shape_);
   v->Visit("_output_shape", &tmp_arr);
+  v->Visit("_compute_cycles", &compute_cycles_);
+  v->Visit("_output_cycles", &output_cycles_);
 }
 
 BlockConfig::BlockConfig(const std::vector<int>& input_shape, const std::vector<int>& output_shape,
diff --git a/tests/python/contrib/test_ethosu/cascader/infra.py b/tests/python/contrib/test_ethosu/cascader/infra.py
index aa681c41f2108..614fed97a0a54 100644
--- a/tests/python/contrib/test_ethosu/cascader/infra.py
+++ b/tests/python/contrib/test_ethosu/cascader/infra.py
@@ -55,6 +55,7 @@ def make_simple_home_map(graph, var_region, const_region):
 
 if ethosu_enabled:
     from tvm.relay.backend.contrib.ethosu.tir.compiler import extract_constants, lower_to_te
+    from tvm.relay.backend.contrib.ethosu.te.common import get_layout_transform_matrices
 
     def create_te_graph(func):
         func, consts = extract_constants(func)
@@ -64,28 +65,24 @@ def create_te_graph(func):
         return te_graph, consts
 
     def make_matrices(
-        op_type, kernel, stride, padding, ifm_layout, ofm_layout, dilation=(1, 1), ifm_channels=1
+        op_type,
+        kernel,
+        stride,
+        padding,
+        ifm_layout,
+        ofm_layout,
+        dilation=(1, 1),
+        ifm_channels=1,
+        ofm_channels=1,
     ):
         kernel_h, kernel_w = kernel
         stride_h, stride_w = stride
         dilation_h, dilation_w = dilation
         dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
         dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-        nhwc_to_nhcwb16 = [
-            [1, 0, 0, 0, 0],
-            [0, 1, 0, 0, 0],
-            [0, 0, 0, 1 / 16, 0],
-            [0, 0, 1, 0, 0],
-            [0, 0, 0, 0, 16],
-            [0, 0, 0, 0, 1],
-        ]
-        nhcwb16_to_nhwc = [
-            [1, 0, 0, 0, 0, 0],
-            [0, 1, 0, 0, 0, 0],
-            [0, 0, 0, 1, 0, 0],
-            [0, 0, 16, 0, 1, -16],
-            [0, 0, 0, 0, 0, 1],
-        ]
+
+        nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(ofm_channels)
+
         if op_type == "ethosu_conv2d":
             ifm_matrix = [
                 [1, 0, 0, 0, 0],
diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_binary_elementwise_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_binary_elementwise_matcher.py
index bb1be7b8e251d..062e5ba0fafd5 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_binary_elementwise_matcher.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_binary_elementwise_matcher.py
@@ -27,25 +27,12 @@
     match_ethosu_binary_elementwise,
     binary_elementwise_compute,
 )
+from tvm.relay.backend.contrib.ethosu.te.common import get_layout_transform_matrices
 
 
-def _make_matrices(broadcast, ifm_layout, ifm2_layout, ofm_layout):
+def _make_matrices(broadcast, ifm_layout, ifm2_layout, ofm_layout, ofm_channels):
     broadcast_h, broadcast_w, broadcast_c = broadcast
-    nhwc_to_nhcwb16 = [
-        [1, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0],
-        [0, 0, 0, 1 / 16, 0],
-        [0, 0, 1, 0, 0],
-        [0, 0, 0, 0, 16],
-        [0, 0, 0, 0, 1],
-    ]
-    nhcwb16_to_nhwc = [
-        [1, 0, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0, 0],
-        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
-        [0, 0, 0, 0, 0, 1],
-    ]
+    nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(ofm_channels)
     ifm_matrix = [
         [1, 0, 0, 0, 0],
         [0, 1, 0, 0, 0],
@@ -93,14 +80,8 @@ def test_ethosu_binary_elementwise_matcher(
     ifm2_shape = [1] + [1 if (b == 1) else a for a, b in zip(ofm_shape[1:], ifm2_broadcast)]
     ifm_channels = ifm_shape[3]
     ifm2_channels = ifm2_shape[3]
-    nhwc_to_nhcwb16 = [
-        [1, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0],
-        [0, 0, 0, 1 / 16, 0],
-        [0, 0, 1, 0, 0],
-        [0, 0, 0, 0, 16],
-        [0, 0, 0, 0, 1],
-    ]
+    ofm_channels = ofm_shape[3]
+    nhwc_to_nhcwb16, _ = get_layout_transform_matrices(ofm_channels)
     broadcast = [1 if a == 1 else 0 for a in ifm2_shape[1:]]
     if ifm_layout == "NHCWB16":
         ifm_shape = [
@@ -173,10 +154,7 @@ def test_ethosu_binary_elementwise_matcher(
     output_stripe_config = cs.StripeConfig(ofm_shape, ofm_shape, ofm_shape, order, stripes, offset)
 
     (ifm_transform, ifm2_transform) = _make_matrices(
-        broadcast,
-        ifm_layout,
-        ifm2_layout,
-        ofm_layout,
+        broadcast, ifm_layout, ifm2_layout, ofm_layout, ofm_channels
     )
 
     device_config = cs.EthosuDeviceConfig("ethos-u55-256")
@@ -190,19 +168,10 @@ def test_ethosu_binary_elementwise_matcher(
     propagated_ifm = ifm_propagator.propagate(output_stripe_config).shape
     propagated_ifm2 = ifm2_propagator.propagate(output_stripe_config).shape
 
-    # Layout conversions will align the propagated IFMs to the brick, i.e. 16
-    # so the expected ifm(2)_shape needs to be rounded up to 16
-    if ifm_layout != ofm_layout:
-        assert ifm_shape[:-1] == propagated_ifm[:-1]
-        assert ((ifm_shape[-1] + 16 - 1) // 16) * 16 == propagated_ifm[-1]
-    else:
-        assert ifm_shape == propagated_ifm
-
-    if ifm2_layout != ofm_layout:
-        assert ifm2_shape[:-1] == propagated_ifm2[:-1]
-        assert ((ifm2_shape[-1] + 16 - 1) // 16) * 16 == propagated_ifm2[-1]
-    else:
-        assert ifm2_shape == propagated_ifm2
+    # The layout transforms that have the exact number of output channels in them
+    # will lose no information about the number of channels
+    assert ifm_shape == propagated_ifm
+    assert ifm2_shape == propagated_ifm2
 
 
 if __name__ == "__main__":
diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py
index 18f15f9257dbf..09fd056ce794c 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py
@@ -22,6 +22,7 @@
 import math
 
 import tvm.contrib.ethosu.cascader as cs
+from tvm.relay.backend.contrib.ethosu.te.common import get_layout_transform_matrices
 
 from .infra import make_matrices
 
@@ -164,7 +165,7 @@
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
                 ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
                 ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
-                ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 16, 1, 4, 4)),
+                ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 10, 1, 6, 4)),
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
                 # Depthwise Conv2D
                 ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
@@ -182,7 +183,7 @@
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
                 ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
                 ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
-                ((1, 10, 6, 8), (1, 16, 1, 4, 8)),
+                ((1, 10, 6, 8), (1, 10, 1, 6, 8)),
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
                 # Depthwise Conv2D
                 ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
@@ -244,28 +245,23 @@ def test_best_block_config(
     acc_config,
     expected_block_configs,
 ):
-    nhwc_to_nhcwb16 = [
-        [1, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0],
-        [0, 0, 0, 1 / 16, 0],
-        [0, 0, 1, 0, 0],
-        [0, 0, 0, 0, 16],
-        [0, 0, 0, 0, 1],
-    ]
-    nhcwb16_to_nhwc = [
-        [1, 0, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0, 0],
-        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
-        [0, 0, 0, 0, 0, 1],
-    ]
-    ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices(
-        op_type, kernel, stride, padding, layouts[0], layouts[1], dilation, in_shape[3]
-    )
-
     ofm_channels = out_shape[3]
     ifm_channels = in_shape[3]
 
+    nhwc_to_nhcwb16, _ = get_layout_transform_matrices(ofm_channels)
+
+    ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices(
+        op_type,
+        kernel,
+        stride,
+        padding,
+        layouts[0],
+        layouts[1],
+        dilation,
+        ifm_channels,
+        ofm_channels,
+    )
+
     if layouts[0] == "NHCWB16":
         in_shape = [
             int(math.ceil(n)) for n in np.matmul(nhwc_to_nhcwb16, in_shape + (1,)).tolist()[:-1]
@@ -321,9 +317,12 @@ def test_best_block_config(
     # Add tensors
     input_tensor = cs.Tensor(in_shape, "int8")
     part.set_input(0, input_tensor)
-    if op_type in ("ethosu_conv2d", "ethosu_depthwise_conv2d"):
+    if op_type == "ethosu_conv2d":
         weight_tensor = cs.Tensor([ofm_channels, kernel[0], kernel[1], ifm_channels], "int8")
         part.set_input(1, weight_tensor)
+    elif op_type == "ethosu_depthwise_conv2d":
+        weight_tensor = cs.Tensor([ofm_channels, kernel[0], kernel[1], 1], "int8")
+        part.set_input(1, weight_tensor)
 
     output_tensor = cs.Tensor(out_shape, "int8")
     part.set_output(output_tensor)
diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py
index 5bd2be49f6204..17b41cbaf511e 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py
@@ -82,6 +82,7 @@ def test_ethosu_conv2d_matcher(
         ofm_layout,
         dilation,
         ifm_channels,
+        ofm_channels,
     )
 
     device_config = cs.EthosuDeviceConfig("ethos-u55-256")
diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_depthwise2d_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_depthwise2d_matcher.py
index c2c45b6524f1b..1e6b6d58b24af 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_depthwise2d_matcher.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_depthwise2d_matcher.py
@@ -83,6 +83,7 @@ def test_ethosu_depthwise2d_matcher(kernel, stride, dilation, padding, ifm_layou
         ifm_layout,
         ofm_layout,
         dilation,
+        ofm_channels=ofm_channels,
     )
 
     device_config = cs.EthosuDeviceConfig("ethos-u55-256")
diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_pooling_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_pooling_matcher.py
index 6ce8ee9a2986d..b998ddaf70457 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_pooling_matcher.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_pooling_matcher.py
@@ -66,6 +66,7 @@ def test_ethosu_pooling_matcher(pool_shape, stride, padding, ifm_layout, ofm_lay
         padding,
         ifm_layout,
         ofm_layout,
+        ofm_channels=ofm_channels,
     )
 
     device_config = cs.EthosuDeviceConfig("ethos-u55-256")
diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_unary_elementwise_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_unary_elementwise_matcher.py
index 0570524e09073..8139f1518f56e 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_unary_elementwise_matcher.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_unary_elementwise_matcher.py
@@ -27,24 +27,11 @@
     match_ethosu_unary_elementwise,
     unary_elementwise_compute,
 )
+from tvm.relay.backend.contrib.ethosu.te.common import get_layout_transform_matrices
 
 
-def _make_matrices(ifm_layout, ofm_layout):
-    nhwc_to_nhcwb16 = [
-        [1, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0],
-        [0, 0, 0, 1 / 16, 0],
-        [0, 0, 1, 0, 0],
-        [0, 0, 0, 0, 16],
-        [0, 0, 0, 0, 1],
-    ]
-    nhcwb16_to_nhwc = [
-        [1, 0, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0, 0],
-        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
-        [0, 0, 0, 0, 0, 1],
-    ]
+def _make_matrices(ifm_layout, ofm_layout, ofm_channels):
+    nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(ofm_channels)
     ifm_matrix = [
         [1, 0, 0, 0, 0],
         [0, 1, 0, 0, 0],
@@ -76,14 +63,7 @@ def _make_matrices(ifm_layout, ofm_layout):
 def test_ethosu_unary_elementwise_matcher(ofm_shape, ifm_layout, ofm_layout, op_type):
     ifm_shape = ofm_shape.copy()
     ofm_channels = ofm_shape[3]
-    nhwc_to_nhcwb16 = [
-        [1, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0],
-        [0, 0, 0, 1 / 16, 0],
-        [0, 0, 1, 0, 0],
-        [0, 0, 0, 0, 16],
-        [0, 0, 0, 0, 1],
-    ]
+    nhwc_to_nhcwb16, _ = get_layout_transform_matrices(ofm_channels)
     if ifm_layout == "NHCWB16":
         ifm_shape = [
             int(math.ceil(n))
@@ -134,7 +114,7 @@ def test_ethosu_unary_elementwise_matcher(ofm_shape, ifm_layout, ofm_layout, op_
     stripes = [0] * len(ofm_shape)
     output_stripe_config = cs.StripeConfig(ofm_shape, ofm_shape, ofm_shape, order, stripes, offset)
 
-    ifm_transform = _make_matrices(ifm_layout, ofm_layout)
+    ifm_transform = _make_matrices(ifm_layout, ofm_layout, ofm_channels)
 
     device_config = cs.EthosuDeviceConfig("ethos-u55-256")
     part = match_ethosu_unary_elementwise(out, device_config)
@@ -145,13 +125,9 @@ def test_ethosu_unary_elementwise_matcher(ofm_shape, ifm_layout, ofm_layout, op_
 
     propagated_ifm = ifm_propagator.propagate(output_stripe_config).shape
 
-    # Layout conversions will align the propagated IFMs to the brick, i.e. 16
-    # so the expected ifm_shape needs to be rounded up to 16
-    if ifm_layout != ofm_layout:
-        assert ifm_shape[:-1] == propagated_ifm[:-1]
-        assert ((ifm_shape[-1] + 16 - 1) // 16) * 16 == propagated_ifm[-1]
-    else:
-        assert ifm_shape == propagated_ifm
+    # The layout transforms that have the exact number of output channels in them
+    # will lose no information about the number of channels
+    assert ifm_shape == propagated_ifm
 
 
 if __name__ == "__main__":