apache · echuraev · Feb 2, 2023 · Jan 23, 2023 · Jan 27, 2023 · jverma-quic
diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py
@@ -93,7 +93,16 @@ def alter_op_layout_qnn_conv2d(attrs, inputs, tinfos, out_type):
 
 # qnn.dense
 register_strategy("qnn.dense", strategy.qnn_dense_strategy)
-register_pattern("qnn.dense", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
+@register_alter_op_layout("qnn.dense")
+def alter_op_layout_qnn_dense(attrs, inputs, tinfos, out_type):
+    """Alternate the layout of qnn.dense"""
+    return topi.nn.qnn_dense_alter_layout(attrs, inputs, tinfos, out_type)
+
+
+# qnn.contrib_dense_pack
+register_strategy("qnn.contrib_dense_pack", strategy.qnn_dense_pack_strategy)
 
 # qnn.batch_matmul
 register_strategy("qnn.batch_matmul", strategy.qnn_batch_matmul_strategy)

diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py
@@ -340,6 +340,62 @@ def helper_change_dtypes_to_int8(attrs, inputs, types, relay_op):
     )
 
 
+def helper_change_dtypes_to_uint8(attrs, inputs, types, relay_op):
+    """Helper function to change dtypes to uint8 x uint8.
+    Legalizes QNN dense op for Hexagon DSP. It supports fast u8 x u8 vrmpy instruction.
+
+    Converting from int8 to uint8 can be done in following manner:
+
+    Original equation
+      scale * (QA - zp_a)
+      scale * (QA + 128 - 128 - zp_a)
+      scale * ( (QA + 128) - (zp_a + 128))
+
+    Replacing QA + 128 with QA' and (zp_a + 128) with zp_a'
+    We get our new quantized uint8 tensor - scale * (QA' - zp_a')
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current convolution
+    inputs : list of tvm.relay.Expr
+        The args of the Relay expr to be legalized
+    types : list of types
+        List of input and output types
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The legalized expr
+    """
+    # Collect the dtypes.
+    data_dtype = types[0].dtype
+    kernel_dtype = types[1].dtype
+
+    # Do nothing since it is already uint8.
+    if data_dtype == "uint8" and kernel_dtype == "uint8":
+        return None
+
+    # Collect the input exprs.
+    data, kernel, input_zero_point, kernel_zero_point, input_scale, kernel_scale = inputs
+
+    # Shift input if necessary.
+    if data_dtype == "int8":
+        # Compute (QA + 128) and (zp_a + 128)
+        data, input_zero_point = _shift(data, input_zero_point, "uint8")
+
+    # Shift kernel if necessary.
+    if kernel_dtype == "int8":
+        # Compute (QA + 128) and (zp_a + 128)
+        kernel, kernel_zero_point = _shift(kernel, kernel_zero_point, "uint8")
+
+    # Call qnn.conv2d/qnn.dense with modified inputs and zero points.
+    new_attrs = dict(attrs)
+    return relay_op(
+        data, kernel, input_zero_point, kernel_zero_point, input_scale, kernel_scale, **new_attrs
+    )
+
+
 # Helper function to change dtypes to be same. ARM dotprod instructions prefer this setting.
 def helper_change_dtypes_to_be_same(attrs, inputs, types, relay_op):
     """Sometimes MxNet + MLDNN can lead to uint8 x int8 datatypes for the conv inputs. However,
@@ -520,7 +576,7 @@ def _qnn_conv2d_legalize_hexagon(attrs, inputs, types):
         out_channel = kernel_tensor.shape[0].value
         ic_modified = False
         oc_modified = False
-        data, kernel, input_zp, output_zp, input_scale, output_scale = inputs
+        data, kernel, data_zp, kernel_zp, data_scale, kernel_scale = inputs
 
         if in_channel % IN_CHANNEL_VECTOR_LENGTH != 0:
             new_in_channel = helper_align_up(in_channel, IN_CHANNEL_VECTOR_LENGTH)
@@ -537,21 +593,93 @@ def _qnn_conv2d_legalize_hexagon(attrs, inputs, types):
             kernel = relay.nn.pad(kernel, pad_width=((0, diff), (0, 0), (0, 0), (0, 0)))
             oc_modified = True
 
+            # Pad kernel zero point by 'diff' elements of 0 if it is not scalar
+            kernel_zp_tensor = types[3]
+            if len(kernel_zp_tensor.shape) != 0:
+                assert isinstance(kernel_zp, relay.Constant)
+                padded_kernel_zp_np = np.append(kernel_zp.data.numpy(), [0] * diff)
+                kernel_zp = relay.const(padded_kernel_zp_np)
+
+            # Pad kernel scale by 'diff' elements of 1.0 if it is not scalar
+            kernel_scale_tensor = types[5]
+            if len(kernel_scale_tensor.shape) != 0:
+                assert isinstance(kernel_scale, relay.Constant)
+                padded_kernel_scale_np = np.append(kernel_scale.data.numpy(), [1.0] * diff)
+                kernel_scale = relay.const(padded_kernel_scale_np)
+
         if ic_modified is True or oc_modified is True:
             new_attrs = dict(attrs)
             if oc_modified:
                 new_attrs["channels"] = new_out_channel
                 out = relay.qnn.op.conv2d(
-                    data, kernel, input_zp, output_zp, input_scale, output_scale, **new_attrs
+                    data, kernel, data_zp, kernel_zp, data_scale, kernel_scale, **new_attrs
                 )
                 output_tensor = types[6]
                 original_out_shape = list(output_tensor.shape)
                 out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape)
             else:
                 out = relay.qnn.op.conv2d(
-                    data, kernel, input_zp, output_zp, input_scale, output_scale, **new_attrs
+                    data, kernel, data_zp, kernel_zp, data_scale, kernel_scale, **new_attrs
                 )
 
             return out
 
     return None
+
+
+@qnn_dense_legalize.register("hexagon")
+def _qnn_dense_legalize_hexagon(attrs, inputs, types):
+    """Legalize qnn.dense op for vrmpy tensorization.
+
+    N dimension of weights should be aligned on vector length. If not, then N dimension is padded to
+    be a multiple of 32.
+    """
+    assert len(types) == 7
+    assert len(inputs) == 6
+
+    data_tensor, kernel_tensor = types[0], types[1]
+    if "int8" not in data_tensor.dtype or "int8" not in kernel_tensor.dtype:
+        return None
+
+    N, _ = kernel_tensor.shape
+
+    if N % OUT_CHANNEL_VECTOR_LENGTH != 0:
+        N_padded = helper_align_up(N, OUT_CHANNEL_VECTOR_LENGTH)
+        diff = N_padded - N
+
+        data, kernel, data_zp, kernel_zp, data_scale, kernel_scale = inputs
+
+        # Pad weights by 'diff'
+        padded_kernel = relay.nn.pad(kernel, pad_width=((0, diff), (0, 0)))
+
+        kernel_zp_tensor, kernel_scale_tensor = types[3], types[5]
+
+        # Pad kernel zero point by 'diff' elements of 0 if it is not scalar
+        if len(kernel_zp_tensor.shape) != 0:
+            assert isinstance(kernel_zp, relay.Constant)
+            assert isinstance(diff, tvm.tir.IntImm)
+            padded_kernel_zp_np = np.append(kernel_zp.data.numpy(), [0] * diff.value)
+            kernel_zp = relay.const(padded_kernel_zp_np)
+
+        # Pad kernel scale by 'diff' elements of 1.0 if it is not scalar
+        if len(kernel_scale_tensor.shape) != 0:
+            assert isinstance(kernel_scale, relay.Constant)
+            assert isinstance(diff, tvm.tir.IntImm)
+            padded_kernel_scale_np = np.append(kernel_scale.data.numpy(), [1.0] * diff.value)
+            kernel_scale = relay.const(padded_kernel_scale_np)
+
+        # If units is explicitly specified, it is used to compute the output shape.
+        # We need to update units after padding to prevent a type error.
+        new_attrs = dict(attrs)
+        if attrs["units"] is not None:
+            new_attrs["units"] = N + diff
+
+        new_inputs = (data, padded_kernel, data_zp, kernel_zp, data_scale, kernel_scale)
+
+        out = relay.qnn.op.dense(*new_inputs, **new_attrs)
+
+        output_tensor = types[6]
+        out = relay.strided_slice(out, begin=[0, 0], end=list(output_tensor.shape))
+        return out
+
+    return None
diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py
@@ -718,6 +718,70 @@ def dense(
     )
 
 
+def contrib_dense_pack(
+    data,
+    weight,
+    input_zero_point,
+    kernel_zero_point,
+    input_scale,
+    kernel_scale,
+    kernel_layout="NC",
+    units=None,
+    out_dtype="int32",
+):
+    """Qnn contrib_dense_pack operator.
+    Applies a quantized linear transformation
+
+     .. math::
+
+     `Y = X * W`
+
+    If doing Per-channel quantization, qnn expects the kernel_zero_scale
+    and optionally the kernel_zero_point will be 1-D vectors instead of scalars.
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The quantized input data to the operator.
+    weight : tvm.relay.Expr
+        The quantized weight expressions.
+    input_zero_point: tvm.relay.Expr
+        The input zero point.
+    kernel_zero_point: tvm.relay.Expr
+        The kernel zero point.
+    input_scale: tvm.relay.Expr
+        The scale for the input tensor.
+    kernel_scale: tvm.relay.Expr
+        The scale for the weight tensor. The scale for the weight tensor is
+        stored for access to this during relay. This information is not
+        needed in the pass pipeline after qnn.conv2d is lowered to the
+        sequence of steps as in nn.conv2d. See also input_scale in Requantize.
+    kernel_layout: str
+        The layout of weight, such as "NC" or "NC32n4c".
+    units : int, optional
+        Number of hidden units of the dense transformation.
+    out_dtype : str, optional
+        Specifies the output data type for mixed precision dense can be int32 or int16.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+
+    return _make.contrib_dense_pack(
+        data,
+        weight,
+        input_zero_point,
+        kernel_zero_point,
+        input_scale,
+        kernel_scale,
+        kernel_layout,
+        units,
+        out_dtype,
+    )
+
+
 def mul(
     lhs,
     rhs,

diff --git a/python/tvm/relay/qnn/strategy/generic.py b/python/tvm/relay/qnn/strategy/generic.py
@@ -267,6 +267,12 @@ def qnn_dense_strategy(attrs, inputs, out_type, target):
     )
 
 
+@override_native_generic_func("qnn_dense_pack_strategy")
+def qnn_dense_pack_strategy(attrs, inputs, out_type, target):
+    """qnn.contrib_dense_pack generic strategy"""
+    raise RuntimeError("qnn.contrib_dense_pack is currently only supported with Hexagon. ")
+
+
 @override_native_generic_func("qnn_batch_matmul_strategy")
 def qnn_batch_matmul_strategy(attrs, inputs, out_type, target):
     """qnn.batch_matmul generic strategy"""

diff --git a/python/tvm/relay/qnn/strategy/hexagon.py b/python/tvm/relay/qnn/strategy/hexagon.py
@@ -173,6 +173,24 @@ def qnn_dense_strategy_hexagon(attrs, inputs, out_type, target):
     return strategy
 
 
+@qnn_dense_pack_strategy.register("hexagon")
+def qnn_dense_pack_strategy_hexagon(attrs, inputs, out_type, target):
+    """qnn.contrib_dense_pack strategy for Hexagon"""
+    strategy = _op.OpStrategy()
+    if (
+        "uint8" in inputs[0].dtype
+        and "int8" in inputs[1].dtype
+        and attrs["weight_layout"] == "NC32n4c"
+    ):
+        # uint8 + uint8|int8 case
+        strategy.add_implementation(
+            wrap_topi_qnn_dense(topi.hexagon.qnn_dense_pack_vrmpy),
+            wrap_topi_schedule(topi.hexagon.schedule_qnn_dense_pack_vrmpy),
+            name="qnn_dense_pack_vrmpy.hexagon",
+        )
+    return strategy
+
+
 @qnn_batch_matmul_strategy.register("hexagon")
 def qnn_batch_matmul_strategy_hexagon(attrs, inputs, out_type, target):
     """qnn.batch_matmul strategy for Hexagon"""

diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py
@@ -20,6 +20,7 @@
 from .adaptive_avg_pool1d import *
 from .avg_pool2d import qnn_avg_pool2d_compute, qnn_avg_pool2d_schedule
 from .conv2d_alter_op import *
+from .dense_alter_op import *
 from .dequantize import dequantize_compute, dequantize_schedule
 from .global_avg_pool2d import *
 from .nn import *

diff --git a/python/tvm/topi/hexagon/qnn/dense_alter_op.py b/python/tvm/topi/hexagon/qnn/dense_alter_op.py
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""QNN Dense alter op functions for Hexagon"""
+
+from tvm import relay
+from ..dense_alter_op import check_vrmpy_applicable
+from ...nn import qnn_dense_alter_layout
+
+
+@qnn_dense_alter_layout.register("hexagon")
+def _alter_qnn_dense_layout(_attrs, inputs, tinfos, out_type):
+    data_tensor = tinfos[0]
+    weight_tensor = tinfos[1]
+
+    if check_vrmpy_applicable(data_tensor, weight_tensor):
+        weight_layout = "NC32n4c"
+        return relay.qnn.op.contrib_dense_pack(*inputs, weight_layout, None, out_type.dtype)
+    else:
+        return None