[microNPU] Tweak a layout transform matrix (#10763)
* [microNPU] Fix layout transform matrix

One of the layout transforms currently causes the cascader to stripe
across the B16 axis (which is not allowed), so change that and deal with
the implications for get_valid_block_configs.

Change-Id: I04199f9f35fcc31618581567483cfb80d3b5aad2

* Reduce the duplication of layout transform matrices

* Change the nhcwb16_to_nhwc matrix for binary and unary elementwise
  such that it matches the other NPU ops
* Reduce the number of places where the same layout transform matrices are
  defined

* Add documentation to the layout transform matrices
ekalda authored Mar 30, 2022
1 parent 62e0470 commit d0c7c78
Showing 15 changed files with 182 additions and 227 deletions.
91 changes: 50 additions & 41 deletions python/tvm/contrib/ethosu/cascader/device_config.py
@@ -439,6 +439,23 @@ def is_partkernel(

return part_kernel_first_utilization > depth_first_utilization or ifm_channels <= 8

def _get_input_banks(self, input_block_shape, input_bytewidth):
input_bytes = input_block_shape.area() * self._align(
input_block_shape.depth * input_bytewidth, 8
)
input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
input_banks = _round_up(input_banks, self._input_granularity)

return input_banks

def _get_accumulator_banks(self, output_block_shape, acc_bytewidth, depth):
acc_depth = _round_up(min(output_block_shape.depth, depth), 8)
acc_bytes = output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])

return acc_banks
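
For reference, a standalone sketch of the bank arithmetic these two helpers share is shown below. The bank size, granularity, and bank budget are placeholders rather than the real Ethos-U SHRAM configuration, and the rounding helpers only mirror the _round_up/_round_up_div/_align utilities used elsewhere in device_config.py.

    def _round_up(a, b):
        # Round a up to the nearest multiple of b
        return ((a + b - 1) // b) * b

    def _round_up_div(a, b):
        # Ceiling division
        return (a + b - 1) // b

    BANK_SIZE_BYTES = 1024   # placeholder SHRAM bank size
    INPUT_GRANULARITY = 4    # placeholder input bank granularity

    def input_banks(block_area, block_depth, bytewidth):
        # Mirrors _get_input_banks: bytes -> double-buffered banks -> granularity
        input_bytes = block_area * _round_up(block_depth * bytewidth, 8)
        banks = _round_up_div(input_bytes, BANK_SIZE_BYTES) * 2
        return _round_up(banks, INPUT_GRANULARITY)

    # e.g. two int8 elementwise inputs of a 16x16x32 block must fit alongside
    # each other, as in the SHRAM check in get_elementwise_block_config below
    banks_available = 48     # placeholder budget
    fits = input_banks(16 * 16, 32, 1) * 2 <= banks_available
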

def get_elementwise_block_config(
self,
ifm_propagator: Propagator,
@@ -533,16 +550,9 @@ def get_elementwise_block_config
input2_block.round_up(self._input_micro_block)

# Banks required for input block
input_bytes = input_block.area() * self._align(input_block.depth * input_bytewidth, 8)
input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
input_banks = _round_up(input_banks, self._input_granularity)

input_banks = self._get_input_banks(input_block, input_bytewidth)
# Banks required for input2 block
input2_bytes = input2_block.area() * self._align(
input2_block.depth * input_bytewidth, 8
)
input2_banks = _round_up_div(input2_bytes, self._bank_size_bytes) * 2
input2_banks = _round_up(input2_banks, self._input_granularity)
input2_banks = self._get_input_banks(input2_block, input_bytewidth)

# Check whether or not both IFMs fit into SHRAM
if (input_banks + input2_banks) <= banks_available:
@@ -561,6 +571,29 @@

return block_config

def _get_subkernel_propagator(
self, op_attrs, ifm_propagator, input_layout, output_layout, depth
):
op_type = op_attrs.get("op")
stride_h = int(op_attrs.get("stride_h", 1))
stride_w = int(op_attrs.get("stride_w", 1))
transform = ifm_propagator.transform

if input_layout == "NHCWB16":
transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h)
transform[3][-1] = min(transform[3][-1], self._subkernel_limits[1] - stride_w)
else:
transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h)
transform[2][-1] = min(transform[2][-1], self._subkernel_limits[1] - stride_w)

if op_type in ("ethosu_pooling", "ethosu_depthwise_conv2d"):
if output_layout == "NHCWB16" and input_layout == "NHWC":
transform[3][-1] = depth
elif output_layout == "NHCWB16" and input_layout == "NHCWB16":
transform[2][-1] = depth // 16

return Propagator(transform, ifm_propagator.offset)

def get_valid_block_configs(
self,
ifm_propagator: Propagator,
@@ -612,33 +645,13 @@ def get_valid_block_configs
op_type = op_attrs.get("op")
op_str = op_attrs.get("op_str")
activation = op_attrs.get("activation", "NONE")
stride_h = int(op_attrs.get("stride_h", 1))
stride_w = int(op_attrs.get("stride_w", 1))
upscaling_factor = 1 if op_attrs.get("upscale", "NONE") == "NONE" else 2

subkernel_transform = ifm_propagator.transform
if output_layout == "NHCWB16":
output_shape = _Shape([1, ofm_shape[1], ofm_shape[3], ofm_channels])
else:
output_shape = _Shape(ofm_shape)

if input_layout == "NHCWB16":
subkernel_transform[1][-1] = min(
subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
)
subkernel_transform[3][-1] = min(
subkernel_transform[3][-1], self._subkernel_limits[1] - stride_w
)
else:
subkernel_transform[1][-1] = min(
subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
)
subkernel_transform[2][-1] = min(
subkernel_transform[2][-1], self._subkernel_limits[1] - stride_w
)

subkernel_propagator = Propagator(subkernel_transform, ifm_propagator.offset)

# Define search space
max_height = min(output_shape.height, self._max_block_shape.height)
min_height = max(self._micro_block.height, upscaling_factor)
@@ -655,7 +668,7 @@ def get_valid_block_configs
if activation == "LUT" and not self._lut_reserved:
banks_available -= 2

# Input block depth has additional limitations for Operators that require full input depth
# Input block depth has additional limitations for operators that require full input depth
input_block_depth = 0
is_partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w)
if op_type == "ethosu_conv2d":
@@ -669,6 +682,10 @@
# Block depth has to be less than full depth or a multiple of the split depth
continue

subkernel_propagator = self._get_subkernel_propagator(
op_attrs, ifm_propagator, input_layout, output_layout, depth
)

for width in range(min_width, max_width + min_width, min_width):
for height in range(min_height, max_height + min_height, min_height):
if output_layout == "NHCWB16":
@@ -709,19 +726,11 @@
input_block_shape.depth = input_block_depth

# Banks required for input block
input_bytes = input_block_shape.area() * self._align(
input_block_shape.depth * input_bytewidth, 8
)
input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
input_banks = _round_up(input_banks, self._input_granularity)

input_banks = self._get_input_banks(input_block_shape, input_bytewidth)
# Banks required for accumulation
acc_depth = _round_up(min(output_block_shape.depth, ofm_channels), 8)
acc_bytes = (
output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
acc_banks = self._get_accumulator_banks(
output_block_shape, acc_bytewidth, depth
)
acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])

if (input_banks + acc_banks) <= banks_available:
output_cycles = self._get_output_cycles(
18 changes: 3 additions & 15 deletions python/tvm/relay/backend/contrib/ethosu/te/binary_elementwise.py
@@ -22,6 +22,7 @@
from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher

from .dma import dma_ofm_compute, dma_ifm_compute
from .common import get_layout_transform_matrices


def binary_elementwise_compute(
@@ -196,21 +197,8 @@ def binary_elementwise_compute(
attrs=binary_elementwise_attrs,
)

nhwc_to_nhcwb16 = [
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
[0, 0, 0, 1 / 16, 0],
[0, 0, 1, 0, 0],
[0, 0, 0, 0, 16],
[0, 0, 0, 0, 1],
]
nhcwb16_to_nhwc = [
[1, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0],
[0, 0, 16, 0, 1, -16],
[0, 0, 0, 0, 0, 1],
]
nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(int(ifm_channels))

ifm_matrix = [
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
60 changes: 60 additions & 0 deletions python/tvm/relay/backend/contrib/ethosu/te/common.py
@@ -0,0 +1,60 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Common methods for the NPU tensor expressions"""

from typing import Tuple, List


def get_layout_transform_matrices(ofm_channels: int) -> Tuple[List[List[float]], List[List[float]]]:
"""Get the NHWC->NHCWB16 and NHCWB16->NHWC layout transform matrices.
For information about the supported layouts see https://developer.arm.com/documentation/102420/
0200/Functional-description/Control-and-data-flow/Supported-memory-formats-for-feature-maps
Parameters
----------
ofm_channels : int
The number of output channels in a NHWC layout
Returns
-------
nhwc_to_nhcwb16, nhcwb16_to_nhwc : Tuple[List[List[float]], List[List[float]]]
The layout transformation matrices
"""

# The value of the last dimension (B16) is always 16.
nhwc_to_nhcwb16 = [
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
[0, 0, 0, 1 / 16, 0],
[0, 0, 1, 0, 0],
[0, 0, 0, 0, 16],
[0, 0, 0, 0, 1],
]

# When we convert from NHWC to NHCWB16, the new C value is given by
# (ofm_channels - 1) // 16 + 1, which is a lossy operation, so we need to use
# the actual value of channels in the transform matrix to accurately recover
# the C in NHWC when we convert from NHCWB16 to NHWC.
nhcwb16_to_nhwc = [
[1, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, ofm_channels],
[0, 0, 0, 0, 0, 1],
]

return nhwc_to_nhcwb16, nhcwb16_to_nhwc
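
As a quick sanity check of the comment above, the two matrices can be applied to a shape written as a homogeneous column vector. This is only an illustrative numpy sketch: the cascader applies the matrices through its Propagator machinery, and the rounding of the C axis up to whole 16-channel bricks happens elsewhere, with np.ceil standing in for it here.

    import numpy as np

    ofm_channels = 40
    nhwc_to_nhcwb16 = np.array([
        [1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 1 / 16, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 0, 0, 16],
        [0, 0, 0, 0, 1],
    ])
    nhcwb16_to_nhwc = np.array([
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, ofm_channels],
        [0, 0, 0, 0, 0, 1],
    ])

    nhwc = np.array([1, 8, 8, 40, 1])          # N, H, W, C plus a homogeneous 1
    nhcwb16 = np.ceil(nhwc_to_nhcwb16 @ nhwc)  # [1, 8, 3, 8, 16, 1]: C becomes ceil(40/16) = 3
    recovered = nhcwb16_to_nhwc @ nhcwb16      # [1, 8, 8, 40, 1]: the constant column restores
                                               # C = 40, where 16 * 3 would have given 48
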
18 changes: 3 additions & 15 deletions python/tvm/relay/backend/contrib/ethosu/te/convolution.py
@@ -23,6 +23,7 @@
from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher

from .dma import dma_ofm_compute, dma_ifm_compute
from .common import get_layout_transform_matrices


def conv2d_compute(
@@ -175,21 +176,8 @@ def conv2d_compute(
attrs=conv2d_attrs,
)

nhwc_to_nhcwb16 = [
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
[0, 0, 0, 1 / 16, 0],
[0, 0, 1, 0, 0],
[0, 0, 0, 0, 16],
[0, 0, 0, 0, 1],
]
nhcwb16_to_nhwc = [
[1, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0],
[0, 0, 16, 0, 1, -16],
[0, 0, 0, 0, 0, 1],
]
nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(ofm_channels)

ifm_matrix = [
[1, 0, 0, 0, 0],
[0, stride_h, 0, 0, (dilated_kernel_h - stride_h)],
18 changes: 3 additions & 15 deletions python/tvm/relay/backend/contrib/ethosu/te/depthwise.py
@@ -23,6 +23,7 @@
from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher

from .dma import dma_ofm_compute, dma_ifm_compute
from .common import get_layout_transform_matrices


def depthwise_conv2d_compute(
@@ -169,21 +170,8 @@ def depthwise_conv2d_compute(
attrs=depthwise_conv2d_attrs,
)

nhwc_to_nhcwb16 = [
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
[0, 0, 0, 1 / 16, 0],
[0, 0, 1, 0, 0],
[0, 0, 0, 0, 16],
[0, 0, 0, 0, 1],
]
nhcwb16_to_nhwc = [
[1, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0],
[0, 0, 16, 0, 1, -16],
[0, 0, 0, 0, 0, 1],
]
nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(channels)

ifm_matrix = [
[1, 0, 0, 0, 0],
[0, stride_h, 0, 0, (dilated_kernel_h - stride_h)],
18 changes: 3 additions & 15 deletions python/tvm/relay/backend/contrib/ethosu/te/pooling.py
@@ -23,6 +23,7 @@
from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher

from .dma import dma_ofm_compute, dma_ifm_compute
from .common import get_layout_transform_matrices


def pooling_compute(
@@ -157,21 +158,8 @@ def pooling_compute(
attrs=pooling_attrs,
)

nhwc_to_nhcwb16 = [
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
[0, 0, 0, 1 / 16, 0],
[0, 0, 1, 0, 0],
[0, 0, 0, 0, 16],
[0, 0, 0, 0, 1],
]
nhcwb16_to_nhwc = [
[1, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0],
[0, 0, 16, 0, 1, -16],
[0, 0, 0, 0, 0, 1],
]
nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(int(ofm_channels))

ifm_matrix = [
[1, 0, 0, 0, 0],
[0, stride_h, 0, 0, (pool_shape_h - stride_h)],
18 changes: 3 additions & 15 deletions python/tvm/relay/backend/contrib/ethosu/te/unary_elementwise.py
@@ -21,6 +21,7 @@
from tvm import te
from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher
from .dma import dma_ofm_compute, dma_ifm_compute
from .common import get_layout_transform_matrices


def unary_elementwise_compute(
@@ -129,21 +130,8 @@ def clz_imp(inp):
attrs=unary_elementwise_attrs,
)

nhwc_to_nhcwb16 = [
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
[0, 0, 0, 1 / 16, 0],
[0, 0, 1, 0, 0],
[0, 0, 0, 0, 16],
[0, 0, 0, 0, 1],
]
nhcwb16_to_nhwc = [
[1, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0],
[0, 0, 16, 0, 1, -16],
[0, 0, 0, 0, 0, 1],
]
nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(int(ofm_channels))

ifm_matrix = [
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
2 changes: 2 additions & 0 deletions src/contrib/ethosu/cascader/block_config.cc
@@ -37,6 +37,8 @@ void BlockConfigNode::VisitAttrs(AttrVisitor* v) {
v->Visit("_input_shape", &tmp_arr);
tmp_arr = make_array(output_shape_);
v->Visit("_output_shape", &tmp_arr);
v->Visit("_compute_cycles", &compute_cycles_);
v->Visit("_output_cycles", &output_cycles_);
}

BlockConfig::BlockConfig(const std::vector<int>& input_shape, const std::vector<int>& output_shape,