diff --git a/gallery/how_to/work_with_relay/using_pipeline_executor.py b/gallery/how_to/work_with_relay/using_pipeline_executor.py old mode 100755 new mode 100644 diff --git a/python/gen_requirements.py b/python/gen_requirements.py old mode 100755 new mode 100644 diff --git a/python/tvm/auto_scheduler/compute_dag.py b/python/tvm/auto_scheduler/compute_dag.py old mode 100755 new mode 100644 diff --git a/python/tvm/contrib/hexagon/hexagon_profiler.py b/python/tvm/contrib/hexagon/hexagon_profiler.py old mode 100755 new mode 100644 diff --git a/python/tvm/micro/contrib/stm32/__init__.py b/python/tvm/micro/contrib/stm32/__init__.py old mode 100755 new mode 100644 index 80e57f2d3912..8558f5335955 --- a/python/tvm/micro/contrib/stm32/__init__.py +++ b/python/tvm/micro/contrib/stm32/__init__.py @@ -1,20 +1,20 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Module container of STM32 code generator.""" - -from .emitter import CodeEmitter, get_input_tensor_name, get_output_tensor_name +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module container of STM32 code generator.""" + +from .emitter import CodeEmitter, get_input_tensor_name, get_output_tensor_name diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py old mode 100755 new mode 100644 diff --git a/python/tvm/relay/transform/infer_layout_utils.py b/python/tvm/relay/transform/infer_layout_utils.py old mode 100755 new mode 100644 diff --git a/python/tvm/topi/hexagon/qnn/adaptive_avg_pool1d.py b/python/tvm/topi/hexagon/qnn/adaptive_avg_pool1d.py old mode 100755 new mode 100644 index 80f1cd1ecf78..14bdd45b56f7 --- a/python/tvm/topi/hexagon/qnn/adaptive_avg_pool1d.py +++ b/python/tvm/topi/hexagon/qnn/adaptive_avg_pool1d.py @@ -1,120 +1,120 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" Compute and schedule for adaptive_avg_pool1d slice op - -Following are few notes and assumptions made by the implementation: - -Assumptions: -1) The input is in NCW layout. Distilbert is the only model that calls - nn.adaptive_avg_pool1d and the only layout it uses is 'NCW'. -2) The op takes output_size as an argument and - only handles the specialized case where output_size is 1. - The argument output_size is used as the value of output_width. -3) Both input and output dtype is uint8/int8 and - quantization parameter is provided to the op. -4) Input is assumed to always be multiple of fixed chunk 32c64w. - -Notes: -1) If input width is used as output width, there can be two cases: - a. If the quantization parameters of input and output are same, - it can return the input as output so the op will be a no-op. - b. If the quantization parameters of input and output are different, - it will essentially be a requantize op. -2) If output_size is a value besides 1 or input_width, - adaptive_avg_pool1d may use dynamic stride and kernel for each output element. - When this case occurs, kernel won't be known at compile time. We want to use - the generic implementation nn.adaptive_avg_pool1d() for this case. 
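As a concrete illustration of the fixed-point requantizing average described in these notes (and implemented by adaptive_avg_pool1d below), here is a minimal, self-contained sketch in plain Python. The to_fixed_point and saturate helpers are simplified stand-ins for topi.hexagon.utils.get_fixed_point_value and saturate, and all quantization parameters are made-up values.

def to_fixed_point(val):
    # Simplified stand-in for get_fixed_point_value(val, "int16"),
    # assuming 0 < val < 1: returns (mantissa, rsh) with val ~= mantissa / 2**rsh.
    rsh = 0
    while round(val * 2 ** (rsh + 1)) < 2 ** 15 - 1:
        rsh += 1
    return round(val * 2 ** rsh), rsh

def saturate(x, lo=0, hi=255):
    # Clamp to the uint8 output range.
    return max(lo, min(hi, x))

# Made-up quantization parameters for a width-16 row reduced to width 1.
input_scale, input_zp = 0.05, 3
output_scale, output_zp = 0.1, 7
in_width = 16

scale_with_area = input_scale / (output_scale * in_width)
scale_fp, rsh = to_fixed_point(scale_with_area)
corr = (output_zp << rsh) - input_zp * in_width * scale_fp

data = list(range(10, 10 + in_width))                   # fake uint8 input row
acc = sum(data)                                         # widened integer accumulation
q_out = saturate((acc * scale_fp + corr) >> rsh)

# Floating-point reference for the same row.
fp_avg = sum((d - input_zp) * input_scale for d in data) / in_width
print(q_out, round(fp_avg / output_scale + output_zp))  # both print 14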
-""" - -from tvm import te -from tvm import tir -from ..utils import get_layout_transform_fn, get_fixed_point_value, saturate - - -def adaptive_avg_pool1d( - data: te.Tensor, - output_size: list, - odtype: str, - input_zero_point: int, - input_scale: float, - output_zero_point: int, - output_scale: float, -): - """adaptive_avg_pool1d compute""" - _, _, inw = data.shape - - out_width = output_size[0] - - n, c = data.shape[:2] - oshape = (n, c) + (out_width,) - - # Kernel is same as input_width since output_width is assumed to be 1 - if out_width == 1: - kw_r = inw - else: - raise RuntimeError(f"Unsupported output_size, {out_width}'") - - if odtype == "uint8": - temp_dtype = "uint32" - elif odtype == "int8": - temp_dtype = "int32" - else: - raise RuntimeError(f"Unsupported output dtype, {odtype}'") - - scale_with_area = input_scale / (output_scale * int(kw_r)) - scale_fixed_point, rsh = get_fixed_point_value(scale_with_area, "int16") - corr = (output_zero_point << rsh) - input_zero_point * kw_r * scale_fixed_point - - rw_r = te.reduce_axis((0, kw_r), name="rw_r") - - sum_compute = te.compute( - oshape, - lambda n, c, w: te.sum(data[n, c, w + rw_r].astype(temp_dtype), axis=[rw_r]), - name="sum", - ) - - avg_compute = te.compute( - oshape, - lambda n, c, w: saturate( - ((sum_compute[n, c, w] * scale_fixed_point) + corr) >> rsh, odtype - ).astype(odtype), - name="adaptive_avg_1d", - ) - return avg_compute - - -def stir_schedule_ncw_32c64w(outs, ins, input_layout: str): - """Schedule for input layout ncw-32c64w and output layout ncw""" - func = te.create_prim_func([ins, outs]) - s = tir.Schedule(func) - - sum_block = s.get_block("sum") - - # Input is multiple of fixed chunk but output is NxCx1 - # Hence transform_layout is only applied on input - input_transformed_layout = get_layout_transform_fn(input_layout) - s.transform_layout(sum_block, buffer=("read", 0), index_map=input_transformed_layout) - - return s - - -def tir_adaptive_avg_pool1d_schedule(outs, ins, output_layout: str, input_layout: str): - """STIR based schedule""" - if output_layout == "ncw": - return stir_schedule_ncw_32c64w(outs, ins, input_layout) - raise RuntimeError(f"Unexpected layout '{output_layout}'") +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Compute and schedule for adaptive_avg_pool1d slice op + +Following are few notes and assumptions made by the implementation: + +Assumptions: +1) The input is in NCW layout. Distilbert is the only model that calls + nn.adaptive_avg_pool1d and the only layout it uses is 'NCW'. +2) The op takes output_size as an argument and + only handles the specialized case where output_size is 1. + The argument output_size is used as the value of output_width. 
+3) Both input and output dtype is uint8/int8 and + quantization parameter is provided to the op. +4) Input is assumed to always be multiple of fixed chunk 32c64w. + +Notes: +1) If input width is used as output width, there can be two cases: + a. If the quantization parameters of input and output are same, + it can return the input as output so the op will be a no-op. + b. If the quantization parameters of input and output are different, + it will essentially be a requantize op. +2) If output_size is a value besides 1 or input_width, + adaptive_avg_pool1d may use dynamic stride and kernel for each output element. + When this case occurs, kernel won't be known at compile time. We want to use + the generic implementation nn.adaptive_avg_pool1d() for this case. +""" + +from tvm import te +from tvm import tir +from ..utils import get_layout_transform_fn, get_fixed_point_value, saturate + + +def adaptive_avg_pool1d( + data: te.Tensor, + output_size: list, + odtype: str, + input_zero_point: int, + input_scale: float, + output_zero_point: int, + output_scale: float, +): + """adaptive_avg_pool1d compute""" + _, _, inw = data.shape + + out_width = output_size[0] + + n, c = data.shape[:2] + oshape = (n, c) + (out_width,) + + # Kernel is same as input_width since output_width is assumed to be 1 + if out_width == 1: + kw_r = inw + else: + raise RuntimeError(f"Unsupported output_size, {out_width}'") + + if odtype == "uint8": + temp_dtype = "uint32" + elif odtype == "int8": + temp_dtype = "int32" + else: + raise RuntimeError(f"Unsupported output dtype, {odtype}'") + + scale_with_area = input_scale / (output_scale * int(kw_r)) + scale_fixed_point, rsh = get_fixed_point_value(scale_with_area, "int16") + corr = (output_zero_point << rsh) - input_zero_point * kw_r * scale_fixed_point + + rw_r = te.reduce_axis((0, kw_r), name="rw_r") + + sum_compute = te.compute( + oshape, + lambda n, c, w: te.sum(data[n, c, w + rw_r].astype(temp_dtype), axis=[rw_r]), + name="sum", + ) + + avg_compute = te.compute( + oshape, + lambda n, c, w: saturate( + ((sum_compute[n, c, w] * scale_fixed_point) + corr) >> rsh, odtype + ).astype(odtype), + name="adaptive_avg_1d", + ) + return avg_compute + + +def stir_schedule_ncw_32c64w(outs, ins, input_layout: str): + """Schedule for input layout ncw-32c64w and output layout ncw""" + func = te.create_prim_func([ins, outs]) + s = tir.Schedule(func) + + sum_block = s.get_block("sum") + + # Input is multiple of fixed chunk but output is NxCx1 + # Hence transform_layout is only applied on input + input_transformed_layout = get_layout_transform_fn(input_layout) + s.transform_layout(sum_block, buffer=("read", 0), index_map=input_transformed_layout) + + return s + + +def tir_adaptive_avg_pool1d_schedule(outs, ins, output_layout: str, input_layout: str): + """STIR based schedule""" + if output_layout == "ncw": + return stir_schedule_ncw_32c64w(outs, ins, input_layout) + raise RuntimeError(f"Unexpected layout '{output_layout}'") diff --git a/python/tvm/topi/hexagon/qnn/global_avg_pool2d.py b/python/tvm/topi/hexagon/qnn/global_avg_pool2d.py old mode 100755 new mode 100644 index 1c171be8976e..24d5224f71cf --- a/python/tvm/topi/hexagon/qnn/global_avg_pool2d.py +++ b/python/tvm/topi/hexagon/qnn/global_avg_pool2d.py @@ -1,95 +1,95 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Assumptions: -1) The input is in NCHW layout. Squeezenet is the only model that calls - nn.global_avg_pool2d and the only layout it uses is 'NCHW'. -2) Both input and output dtype is uint8 and - quantization parameter is provided to the op. -3) Input is assumed to always be multiple of fixed chunk 32c8h8w. -""" - -from tvm import te -from tvm import tir -from ..utils import get_layout_transform_fn, get_fixed_point_value, saturate - - -def global_avg_pool2d_u8( - data: te.Tensor, - odtype: str, - input_zero_point: int, - input_scale: float, - output_zero_point: int, - output_scale: float, -): - """global_avg_pool2d""" - input_b, input_c, input_h, input_w = data.shape - oshape = (input_b, input_c) + (1, 1) - - if input_h * input_w < 256: - bits = "16" - else: - bits = "32" - - if odtype == "uint8": - temp_dtype = "uint" + bits - elif odtype == "int8": - temp_dtype = "int" + bits - else: - raise RuntimeError(f"Unsupported output dtype, {odtype}'") - - pool_area = input_h * input_w - rh_r = te.reduce_axis((0, input_h), name="rh_r") - rw_r = te.reduce_axis((0, input_w), name="rw_r") - - scale_with_area = input_scale / (output_scale * int(pool_area)) - scale_fixed_point, rsh = get_fixed_point_value(scale_with_area, "int16") - corr = (output_zero_point << rsh) - input_zero_point * pool_area * scale_fixed_point - - sum_compute = te.compute( - oshape, - lambda n, c, h, w: te.sum( - data[n, c, h + rh_r, w + rw_r].astype(temp_dtype), axis=[rh_r, rw_r] - ), - name="sum", - ) - - avg_compute = te.compute( - oshape, - lambda n, c, h, w: saturate( - ((sum_compute[n, c, h, w] * scale_fixed_point) + corr) >> rsh, odtype - ).astype(odtype), - name="global_avg_pool2d", - ) - - return avg_compute - - -def stir_global_avg_pool2d_u8_schedule(outs: te.Tensor, ins: te.Tensor, input_layout: str): - """Schedule""" - func = te.create_prim_func([ins, outs]) - s = tir.Schedule(func) - - sum_block = s.get_block("sum") - - # Input is multiple of fixed chunk but output is NxCx1x1 - # Hence transform_layout is only applied on input - input_transformed_layout = get_layout_transform_fn(input_layout) - s.transform_layout(sum_block, buffer=("read", 0), index_map=input_transformed_layout) - - return s +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Assumptions: +1) The input is in NCHW layout. Squeezenet is the only model that calls + nn.global_avg_pool2d and the only layout it uses is 'NCHW'. +2) Both input and output dtype is uint8 and + quantization parameter is provided to the op. +3) Input is assumed to always be multiple of fixed chunk 32c8h8w. +""" + +from tvm import te +from tvm import tir +from ..utils import get_layout_transform_fn, get_fixed_point_value, saturate + + +def global_avg_pool2d_u8( + data: te.Tensor, + odtype: str, + input_zero_point: int, + input_scale: float, + output_zero_point: int, + output_scale: float, +): + """global_avg_pool2d""" + input_b, input_c, input_h, input_w = data.shape + oshape = (input_b, input_c) + (1, 1) + + if input_h * input_w < 256: + bits = "16" + else: + bits = "32" + + if odtype == "uint8": + temp_dtype = "uint" + bits + elif odtype == "int8": + temp_dtype = "int" + bits + else: + raise RuntimeError(f"Unsupported output dtype, {odtype}'") + + pool_area = input_h * input_w + rh_r = te.reduce_axis((0, input_h), name="rh_r") + rw_r = te.reduce_axis((0, input_w), name="rw_r") + + scale_with_area = input_scale / (output_scale * int(pool_area)) + scale_fixed_point, rsh = get_fixed_point_value(scale_with_area, "int16") + corr = (output_zero_point << rsh) - input_zero_point * pool_area * scale_fixed_point + + sum_compute = te.compute( + oshape, + lambda n, c, h, w: te.sum( + data[n, c, h + rh_r, w + rw_r].astype(temp_dtype), axis=[rh_r, rw_r] + ), + name="sum", + ) + + avg_compute = te.compute( + oshape, + lambda n, c, h, w: saturate( + ((sum_compute[n, c, h, w] * scale_fixed_point) + corr) >> rsh, odtype + ).astype(odtype), + name="global_avg_pool2d", + ) + + return avg_compute + + +def stir_global_avg_pool2d_u8_schedule(outs: te.Tensor, ins: te.Tensor, input_layout: str): + """Schedule""" + func = te.create_prim_func([ins, outs]) + s = tir.Schedule(func) + + sum_block = s.get_block("sum") + + # Input is multiple of fixed chunk but output is NxCx1x1 + # Hence transform_layout is only applied on input + input_transformed_layout = get_layout_transform_fn(input_layout) + s.transform_layout(sum_block, buffer=("read", 0), index_map=input_transformed_layout) + + return s diff --git a/python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py b/python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py old mode 100755 new mode 100644 index 043ad313bdef..a974ad643107 --- a/python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py +++ b/python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py @@ -1,270 +1,270 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
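The 16-bit versus 32-bit accumulator choice in global_avg_pool2d_u8 above relies on the worst-case sum fitting the narrower type when the pool area stays below 256; a quick sanity check of that bound, assuming a pool area just under the threshold:

# Worst-case accumulations for a pool area just under the 256-element threshold.
pool_area = 255
assert 255 * pool_area <= 2**16 - 1    # uint8 inputs summed into uint16
assert 127 * pool_area <= 2**15 - 1    # int8 positive extreme fits int16
assert -128 * pool_area >= -(2**15)    # int8 negative extreme also fits int16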
-# pylint: disable=invalid-name - -"""Compute and schedule for quantized add, multiply, subtract op - -Please note the following assumptions made by the implementation: - -1) The inputs will be multiple of crouton layout except for the axis that needs broadcasting.""" - -from tvm import te -from tvm import tir -from ..utils import get_layout_transform_fn, get_fixed_point_value - - -def broadcast_axis(tensor_A, tensor_B): - """Find out the indices that will have broadcasting""" - A_broadcast = [] - B_broadcast = [] - - for i in range(len(tensor_A.shape)): - if tensor_A.shape[i] == tensor_B.shape[i]: - A_broadcast.append(1) - B_broadcast.append(1) - elif tensor_A.shape[i] == 1: - A_broadcast.append(0) - B_broadcast.append(1) - elif tensor_B.shape[i] == 1: - A_broadcast.append(1) - B_broadcast.append(0) - return A_broadcast, B_broadcast - - -def saturate(x: te.Tensor, dtype: str): - """Saturate value for the specified data type""" - return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype))) - - -def get_int_scale( - scale_A: float, - scale_B: float, - scale_M: float, - zero_point_A: int, - zero_point_B: int, - zero_point_M: int, - op: str, -): - """ - Get fixed-point number and exp_scale_factor from topi.hexagon.utils.get_fixed_point_value. - Also, depending on the op, this function uses exp_scale_factor(log2 of the scale factor) - to adjust the output's zero_point. - """ - - C_recip = 1 / scale_M - - if op == "qmul": - scale = scale_A * scale_B * C_recip - scale_fixed_point, rsh = get_fixed_point_value(scale, "int16") - - # We need to adjust output's zero point value since the compute for the op is multiplied - # by a scaling factor. - # The scaling factor is 2^x where x is the exp_scale_factor which is assigned to rsh here. - # Since zero_point_M is multipled by 2^rsh while converting floating-point scale value - # into fixed-point number, we left shift it by rsh in our compute to reflect that. - - corr = zero_point_M << rsh - - return scale_fixed_point, rsh, corr - - a_scale_f = scale_A * C_recip - b_scale_f = scale_B * C_recip - scale_fixed_point_a, rsh_a = get_fixed_point_value(a_scale_f, "int16") - scale_fixed_point_b, rsh_b = get_fixed_point_value(b_scale_f, "int16") - - # Here we have two exp_scale_factors rsh_a and rsh_b. - # To avoid complexity, we want to use a common exp_scale_factor and - # we want to use the lowest of the two. - - # Since, either of scale_fixed_point_a or scale_fixed_point_b has already been multiplied - # by 2^max(rsh_a, rsh_b) in topi.hexagon.utils.get_fixed_point_value, - # we want to undo that by right shifting that scale_fixed_point value - # by the difference of rsh_a and rsh_b. - - # This results into having a common exp_scale_factor for both scale_fixed_point_a - # and scale_fixed_point_b. - - # We also set rsh here which is used to adjust the zero_point_M and compute the corr value, - # computation of which comes from the original equation of the op's compute. 
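To make the common-exponent trick described in the comments above concrete, here is a small standalone sketch of the qadd path, using a simplified stand-in for get_fixed_point_value and made-up quantization parameters; it aligns the two per-input fixed-point scales to the smaller exponent and checks one requantized add against the floating-point result.

def to_fixed_point(val):
    # Simplified stand-in for get_fixed_point_value(val, "int16"), assuming 0 < val < 1.
    rsh = 0
    while round(val * 2 ** (rsh + 1)) < 2 ** 15 - 1:
        rsh += 1
    return round(val * 2 ** rsh), rsh

# Made-up quantization parameters for C = A + B.
scale_A, zp_A = 0.02, 10
scale_B, zp_B = 0.03, 4
scale_M, zp_M = 0.05, 2

fa, rsh_a = to_fixed_point(scale_A / scale_M)   # per-input rescale factors
fb, rsh_b = to_fixed_point(scale_B / scale_M)

# Align both mantissas to the smaller exponent, as the comments describe.
if rsh_a > rsh_b:
    fa, rsh = fa >> (rsh_a - rsh_b), rsh_b
else:
    fb, rsh = fb >> (rsh_b - rsh_a), rsh_a

corr = (zp_M << rsh) - (zp_A * fa + zp_B * fb)

a, b = 57, 23                                   # fake uint8 input values
q_out = (a * fa + b * fb + corr) >> rsh         # saturate() to the output dtype omitted

# Floating-point reference for the same element.
ref = ((a - zp_A) * scale_A + (b - zp_B) * scale_B) / scale_M + zp_M
print(q_out, round(ref))                        # both print 32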
- - if rsh_a > rsh_b: - scale_fixed_point_a = scale_fixed_point_a >> (rsh_a - rsh_b) - rsh = rsh_b - else: - scale_fixed_point_b = scale_fixed_point_b >> (rsh_b - rsh_a) - rsh = rsh_a - - if op == "qadd": - corr = (zero_point_M << rsh) - ( - zero_point_A * scale_fixed_point_a + zero_point_B * scale_fixed_point_b - ) - else: - corr = (zero_point_M << rsh) - ( - zero_point_A * scale_fixed_point_a - zero_point_B * scale_fixed_point_b - ) - - return scale_fixed_point_a, scale_fixed_point_b, rsh, corr - - -def qadd_broadcast_compute( - tensor_A: te.Tensor, - tensor_B: te.Tensor, - output_shape: list, - zero_point_A: int, - scale_A: float, - zero_point_B: int, - scale_B: float, - zero_point_M: int, - scale_M: float, - dtype: str, -): - """Compute quantized add with broadcasting""" - A_broadcast, B_broadcast = broadcast_axis(tensor_A, tensor_B) - n_a, h_a, w_a, c_a = A_broadcast - n_b, h_b, w_b, c_b = B_broadcast - - scale_a, scale_b, rsh, corr = get_int_scale( - scale_A, scale_B, scale_M, zero_point_A, zero_point_B, zero_point_M, "qadd" - ) - - return te.compute( - output_shape, - lambda n, h, w, c: saturate( - ( - ( - (tensor_A[n * n_a, h * h_a, w * w_a, c * c_a] * scale_a) - + (tensor_B[n * n_b, h * h_b, w * w_b, c * c_b] * scale_b) - + corr - ) - >> rsh - ), - dtype, - ).astype(dtype), - ) - - -def qsubtract_broadcast_compute( - tensor_A: te.Tensor, - tensor_B: te.Tensor, - output_shape: list, - zero_point_A: int, - scale_A: float, - zero_point_B: int, - scale_B: float, - zero_point_M: int, - scale_M: float, - dtype: str, -): - """Compute quantized subtract with broadcasting""" - A_broadcast, B_broadcast = broadcast_axis(tensor_A, tensor_B) - n_a, h_a, w_a, c_a = A_broadcast - n_b, h_b, w_b, c_b = B_broadcast - - scale_a, scale_b, rsh, corr = get_int_scale( - scale_A, scale_B, scale_M, zero_point_A, zero_point_B, zero_point_M, "qsub" - ) - - return te.compute( - output_shape, - lambda n, h, w, c: saturate( - ( - ( - (tensor_A[n * n_a, h * h_a, w * w_a, c * c_a] * scale_a) - - (tensor_B[n * n_b, h * h_b, w * w_b, c * c_b] * scale_b) - + corr - ) - >> rsh - ), - dtype, - ).astype(dtype), - ) - - -def qmultiply_broadcast_compute( - tensor_A: te.Tensor, - tensor_B: te.Tensor, - output_shape: list, - zero_point_A: int, - scale_A: float, - zero_point_B: int, - scale_B: float, - zero_point_M: int, - scale_M: float, - dtype: str, -): - """Compute quantized multiply with broadcasting""" - A_broadcast, B_broadcast = broadcast_axis(tensor_A, tensor_B) - n_a, h_a, w_a, c_a = A_broadcast - n_b, h_b, w_b, c_b = B_broadcast - - scale_int, rsh, corr = get_int_scale( - scale_A, scale_B, scale_M, zero_point_A, zero_point_B, zero_point_M, "qmul" - ) - - return te.compute( - output_shape, - lambda n, h, w, c: saturate( - ( - ( - scale_int - * (tensor_A[n * n_a, h * h_a, w * w_a, c * c_a] - zero_point_A) - * (tensor_B[n * n_b, h * h_b, w * w_b, c * c_b] - zero_point_B) - + corr - ) - >> rsh - ), - dtype, - ).astype(dtype), - ) - - -def tir_schedule_quant( - out_M: te.Tensor, - tensor_A: te.Tensor, - tensor_B: te.Tensor, - output_layout: str, - tensor_A_layout: str, - tensor_B_layout: str, -): - """Schedule for output layout nhwc-8h8w32c-2d""" - func = te.create_prim_func([tensor_A, tensor_B, out_M]) - - s = tir.Schedule(func) - - block = s.get_block("compute") - - if tensor_A_layout == "nhwc-8h8w32c-2d": - tensor_A_transformed_layout = get_layout_transform_fn(tensor_A_layout) - s.transform_layout(block, buffer=tensor_A.name, index_map=tensor_A_transformed_layout) - - if tensor_B_layout == "nhwc-8h8w32c-2d": - 
tensor_B_transformed_layout = get_layout_transform_fn(tensor_B_layout) - s.transform_layout(block, buffer=tensor_B.name, index_map=tensor_B_transformed_layout) - - output_transformed_layout = get_layout_transform_fn(output_layout) - s.transform_layout(block, buffer=out_M.name, index_map=output_transformed_layout) - - n, h, w, c = s.get_loops(block) - - h_o, h_i = s.split(h, [None, 8]) - w_o, w_i = s.split(w, [None, 8]) - c_o, c_i = s.split(c, [None, 32]) - wio, wii = s.split(w_i, [None, 4]) - - s.reorder(n, h_o, w_o, c_o, h_i, wio, wii, c_i) - - return s +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name + +"""Compute and schedule for quantized add, multiply, subtract op + +Please note the following assumptions made by the implementation: + +1) The inputs will be multiple of crouton layout except for the axis that needs broadcasting.""" + +from tvm import te +from tvm import tir +from ..utils import get_layout_transform_fn, get_fixed_point_value + + +def broadcast_axis(tensor_A, tensor_B): + """Find out the indices that will have broadcasting""" + A_broadcast = [] + B_broadcast = [] + + for i in range(len(tensor_A.shape)): + if tensor_A.shape[i] == tensor_B.shape[i]: + A_broadcast.append(1) + B_broadcast.append(1) + elif tensor_A.shape[i] == 1: + A_broadcast.append(0) + B_broadcast.append(1) + elif tensor_B.shape[i] == 1: + A_broadcast.append(1) + B_broadcast.append(0) + return A_broadcast, B_broadcast + + +def saturate(x: te.Tensor, dtype: str): + """Saturate value for the specified data type""" + return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype))) + + +def get_int_scale( + scale_A: float, + scale_B: float, + scale_M: float, + zero_point_A: int, + zero_point_B: int, + zero_point_M: int, + op: str, +): + """ + Get fixed-point number and exp_scale_factor from topi.hexagon.utils.get_fixed_point_value. + Also, depending on the op, this function uses exp_scale_factor(log2 of the scale factor) + to adjust the output's zero_point. + """ + + C_recip = 1 / scale_M + + if op == "qmul": + scale = scale_A * scale_B * C_recip + scale_fixed_point, rsh = get_fixed_point_value(scale, "int16") + + # We need to adjust output's zero point value since the compute for the op is multiplied + # by a scaling factor. + # The scaling factor is 2^x where x is the exp_scale_factor which is assigned to rsh here. + # Since zero_point_M is multipled by 2^rsh while converting floating-point scale value + # into fixed-point number, we left shift it by rsh in our compute to reflect that. 
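Following on from the comment above, a short numeric check of the qmul correction, with made-up scales and zero points; the int16 fixed-point pair for scale_A * scale_B / scale_M is worked out by hand here rather than by calling get_fixed_point_value, and the final saturate to the output dtype is omitted.

# Made-up quantization parameters for a single qmul output element.
scale_A, zp_A = 0.02, 10
scale_B, zp_B = 0.03, 4
scale_M, zp_M = 0.005, 7

# scale_A * scale_B / scale_M = 0.12 ~= 31457 / 2**18 (hand-derived int16 fixed point).
scale_int, rsh = 31457, 18
corr = zp_M << rsh               # output zero point pre-shifted by the same exponent

a, b = 55, 27                    # fake uint8 inputs
q_out = (scale_int * (a - zp_A) * (b - zp_B) + corr) >> rsh

# Floating-point reference for the same element.
ref = (a - zp_A) * scale_A * (b - zp_B) * scale_B / scale_M + zp_M
assert abs(q_out - ref) < 1      # agrees up to the truncation of the right shift
print(q_out, ref)                # 131 vs ~131.2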
+ + corr = zero_point_M << rsh + + return scale_fixed_point, rsh, corr + + a_scale_f = scale_A * C_recip + b_scale_f = scale_B * C_recip + scale_fixed_point_a, rsh_a = get_fixed_point_value(a_scale_f, "int16") + scale_fixed_point_b, rsh_b = get_fixed_point_value(b_scale_f, "int16") + + # Here we have two exp_scale_factors rsh_a and rsh_b. + # To avoid complexity, we want to use a common exp_scale_factor and + # we want to use the lowest of the two. + + # Since, either of scale_fixed_point_a or scale_fixed_point_b has already been multiplied + # by 2^max(rsh_a, rsh_b) in topi.hexagon.utils.get_fixed_point_value, + # we want to undo that by right shifting that scale_fixed_point value + # by the difference of rsh_a and rsh_b. + + # This results into having a common exp_scale_factor for both scale_fixed_point_a + # and scale_fixed_point_b. + + # We also set rsh here which is used to adjust the zero_point_M and compute the corr value, + # computation of which comes from the original equation of the op's compute. + + if rsh_a > rsh_b: + scale_fixed_point_a = scale_fixed_point_a >> (rsh_a - rsh_b) + rsh = rsh_b + else: + scale_fixed_point_b = scale_fixed_point_b >> (rsh_b - rsh_a) + rsh = rsh_a + + if op == "qadd": + corr = (zero_point_M << rsh) - ( + zero_point_A * scale_fixed_point_a + zero_point_B * scale_fixed_point_b + ) + else: + corr = (zero_point_M << rsh) - ( + zero_point_A * scale_fixed_point_a - zero_point_B * scale_fixed_point_b + ) + + return scale_fixed_point_a, scale_fixed_point_b, rsh, corr + + +def qadd_broadcast_compute( + tensor_A: te.Tensor, + tensor_B: te.Tensor, + output_shape: list, + zero_point_A: int, + scale_A: float, + zero_point_B: int, + scale_B: float, + zero_point_M: int, + scale_M: float, + dtype: str, +): + """Compute quantized add with broadcasting""" + A_broadcast, B_broadcast = broadcast_axis(tensor_A, tensor_B) + n_a, h_a, w_a, c_a = A_broadcast + n_b, h_b, w_b, c_b = B_broadcast + + scale_a, scale_b, rsh, corr = get_int_scale( + scale_A, scale_B, scale_M, zero_point_A, zero_point_B, zero_point_M, "qadd" + ) + + return te.compute( + output_shape, + lambda n, h, w, c: saturate( + ( + ( + (tensor_A[n * n_a, h * h_a, w * w_a, c * c_a] * scale_a) + + (tensor_B[n * n_b, h * h_b, w * w_b, c * c_b] * scale_b) + + corr + ) + >> rsh + ), + dtype, + ).astype(dtype), + ) + + +def qsubtract_broadcast_compute( + tensor_A: te.Tensor, + tensor_B: te.Tensor, + output_shape: list, + zero_point_A: int, + scale_A: float, + zero_point_B: int, + scale_B: float, + zero_point_M: int, + scale_M: float, + dtype: str, +): + """Compute quantized subtract with broadcasting""" + A_broadcast, B_broadcast = broadcast_axis(tensor_A, tensor_B) + n_a, h_a, w_a, c_a = A_broadcast + n_b, h_b, w_b, c_b = B_broadcast + + scale_a, scale_b, rsh, corr = get_int_scale( + scale_A, scale_B, scale_M, zero_point_A, zero_point_B, zero_point_M, "qsub" + ) + + return te.compute( + output_shape, + lambda n, h, w, c: saturate( + ( + ( + (tensor_A[n * n_a, h * h_a, w * w_a, c * c_a] * scale_a) + - (tensor_B[n * n_b, h * h_b, w * w_b, c * c_b] * scale_b) + + corr + ) + >> rsh + ), + dtype, + ).astype(dtype), + ) + + +def qmultiply_broadcast_compute( + tensor_A: te.Tensor, + tensor_B: te.Tensor, + output_shape: list, + zero_point_A: int, + scale_A: float, + zero_point_B: int, + scale_B: float, + zero_point_M: int, + scale_M: float, + dtype: str, +): + """Compute quantized multiply with broadcasting""" + A_broadcast, B_broadcast = broadcast_axis(tensor_A, tensor_B) + n_a, h_a, w_a, c_a = A_broadcast + n_b, h_b, 
w_b, c_b = B_broadcast + + scale_int, rsh, corr = get_int_scale( + scale_A, scale_B, scale_M, zero_point_A, zero_point_B, zero_point_M, "qmul" + ) + + return te.compute( + output_shape, + lambda n, h, w, c: saturate( + ( + ( + scale_int + * (tensor_A[n * n_a, h * h_a, w * w_a, c * c_a] - zero_point_A) + * (tensor_B[n * n_b, h * h_b, w * w_b, c * c_b] - zero_point_B) + + corr + ) + >> rsh + ), + dtype, + ).astype(dtype), + ) + + +def tir_schedule_quant( + out_M: te.Tensor, + tensor_A: te.Tensor, + tensor_B: te.Tensor, + output_layout: str, + tensor_A_layout: str, + tensor_B_layout: str, +): + """Schedule for output layout nhwc-8h8w32c-2d""" + func = te.create_prim_func([tensor_A, tensor_B, out_M]) + + s = tir.Schedule(func) + + block = s.get_block("compute") + + if tensor_A_layout == "nhwc-8h8w32c-2d": + tensor_A_transformed_layout = get_layout_transform_fn(tensor_A_layout) + s.transform_layout(block, buffer=tensor_A.name, index_map=tensor_A_transformed_layout) + + if tensor_B_layout == "nhwc-8h8w32c-2d": + tensor_B_transformed_layout = get_layout_transform_fn(tensor_B_layout) + s.transform_layout(block, buffer=tensor_B.name, index_map=tensor_B_transformed_layout) + + output_transformed_layout = get_layout_transform_fn(output_layout) + s.transform_layout(block, buffer=out_M.name, index_map=output_transformed_layout) + + n, h, w, c = s.get_loops(block) + + h_o, h_i = s.split(h, [None, 8]) + w_o, w_i = s.split(w, [None, 8]) + c_o, c_i = s.split(c, [None, 32]) + wio, wii = s.split(w_i, [None, 4]) + + s.reorder(n, h_o, w_o, c_o, h_i, wio, wii, c_i) + + return s diff --git a/python/tvm/topi/hexagon/qnn/quantize.py b/python/tvm/topi/hexagon/qnn/quantize.py old mode 100755 new mode 100644 index ff03aac0a862..3fd91ddce6ca --- a/python/tvm/topi/hexagon/qnn/quantize.py +++ b/python/tvm/topi/hexagon/qnn/quantize.py @@ -1,80 +1,80 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
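The index-multiplier trick used by broadcast_axis above can be exercised on plain shapes; this standalone sketch (with a lightweight stand-in object that only exposes .shape) shows how a size-1 axis collapses to index 0 inside the te.compute lambdas.

class FakeTensor:
    # Minimal stand-in exposing only .shape, enough to drive the broadcast logic.
    def __init__(self, shape):
        self.shape = shape

def broadcast_axis(tensor_A, tensor_B):
    # Same logic as in the diff above: 1 keeps the loop index, 0 pins the axis to 0.
    A_broadcast, B_broadcast = [], []
    for dim_a, dim_b in zip(tensor_A.shape, tensor_B.shape):
        if dim_a == dim_b:
            A_broadcast.append(1)
            B_broadcast.append(1)
        elif dim_a == 1:
            A_broadcast.append(0)
            B_broadcast.append(1)
        else:
            A_broadcast.append(1)
            B_broadcast.append(0)
    return A_broadcast, B_broadcast

a = FakeTensor((1, 8, 8, 32))
b = FakeTensor((1, 1, 8, 32))          # broadcast along H
a_mul, b_mul = broadcast_axis(a, b)
print(a_mul, b_mul)                    # [1, 1, 1, 1] [1, 0, 1, 1]
# Inside te.compute, B is then indexed as B[n*1, h*0, w*1, c*1], i.e. B[n, 0, w, c].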
-# pylint: disable=invalid-name -"""Compute and schedule for hexagon quantize -Please note the following assumptions made by the implementation: -1) The input and output data will be multiple of crouton layout -2) And the supported layout is NHWC -3) The input layout will be nhwc-4h2w32c2w-2d and - output layout will be nhwc-8h8w32c-2d""" - - -from tvm import te -from tvm import tir -from ..utils import get_layout_transform_fn, saturate - - -def quantize_compute(tensor_A: te.Tensor, scale: float, zero_point: int, dtype: str): - """Compute for quantize""" - scale_recip = 1 / scale - - return te.compute( - tensor_A.shape, - lambda n, h, w, c: saturate( - ((tensor_A[n, h, w, c] * scale_recip).astype("int32") + zero_point), - dtype, - ).astype(dtype), - name="quantize", - ) - - -def tir_quantize_schedule( - out_M: te.Tensor, - tensor_A: te.Tensor, - input_layout: str, - output_layout: str, -): - """Schedule for output layout nhwc-8h8w32c-2d""" - func = te.create_prim_func([tensor_A, out_M]) - - s = tir.Schedule(func) - - block = s.get_block("quantize") - - input_transformed_layout = get_layout_transform_fn(input_layout) - s.transform_layout(block, buffer=tensor_A.name, index_map=input_transformed_layout) - - output_transformed_layout = get_layout_transform_fn(output_layout) - s.transform_layout(block, buffer=out_M.name, index_map=output_transformed_layout) - - # Fixed chunk size is 2048 byte - # For uint8 the layout for fixed chunk is 8x8x32 - # where each element is 1 bytes - # Split and reorder is done to iterate over the fixed chunk - # Channel is split by a factor of 32 - # Width is split by a factor of 8 - # Height is split by a factor of 8 - n, h, w, c = s.get_loops(block) - - h_o, h_i = s.split(h, [None, 8]) - w_o, w_i = s.split(w, [None, 8]) - c_o, c_i = s.split(c, [None, 32]) - wio, wii = s.split(w_i, [None, 4]) - - s.reorder(n, h_o, w_o, c_o, h_i, wio, wii, c_i) - - return s +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
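The quantize_compute above maps float values into the quantized domain with a reciprocal multiply, a truncating cast, a zero-point shift and a saturate; a tiny standalone recreation with a made-up scale and zero point:

def saturate(x, lo=0, hi=255):
    # Clamp to the uint8 range, mirroring the saturate() helper in topi.hexagon.utils.
    return max(lo, min(hi, x))

scale, zero_point = 0.05, 10          # made-up quantization parameters
scale_recip = 1 / scale

for x in (-1.0, 0.0, 2.5, 20.0):
    q = saturate(int(x * scale_recip) + zero_point)   # int() mimics astype("int32")
    print(x, "->", q)
# -1.0 -> 0 (clamped), 0.0 -> 10, 2.5 -> 60, 20.0 -> 255 (clamped)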
+# pylint: disable=invalid-name +"""Compute and schedule for hexagon quantize +Please note the following assumptions made by the implementation: +1) The input and output data will be multiple of crouton layout +2) And the supported layout is NHWC +3) The input layout will be nhwc-4h2w32c2w-2d and + output layout will be nhwc-8h8w32c-2d""" + + +from tvm import te +from tvm import tir +from ..utils import get_layout_transform_fn, saturate + + +def quantize_compute(tensor_A: te.Tensor, scale: float, zero_point: int, dtype: str): + """Compute for quantize""" + scale_recip = 1 / scale + + return te.compute( + tensor_A.shape, + lambda n, h, w, c: saturate( + ((tensor_A[n, h, w, c] * scale_recip).astype("int32") + zero_point), + dtype, + ).astype(dtype), + name="quantize", + ) + + +def tir_quantize_schedule( + out_M: te.Tensor, + tensor_A: te.Tensor, + input_layout: str, + output_layout: str, +): + """Schedule for output layout nhwc-8h8w32c-2d""" + func = te.create_prim_func([tensor_A, out_M]) + + s = tir.Schedule(func) + + block = s.get_block("quantize") + + input_transformed_layout = get_layout_transform_fn(input_layout) + s.transform_layout(block, buffer=tensor_A.name, index_map=input_transformed_layout) + + output_transformed_layout = get_layout_transform_fn(output_layout) + s.transform_layout(block, buffer=out_M.name, index_map=output_transformed_layout) + + # Fixed chunk size is 2048 byte + # For uint8 the layout for fixed chunk is 8x8x32 + # where each element is 1 bytes + # Split and reorder is done to iterate over the fixed chunk + # Channel is split by a factor of 32 + # Width is split by a factor of 8 + # Height is split by a factor of 8 + n, h, w, c = s.get_loops(block) + + h_o, h_i = s.split(h, [None, 8]) + w_o, w_i = s.split(w, [None, 8]) + c_o, c_i = s.split(c, [None, 32]) + wio, wii = s.split(w_i, [None, 4]) + + s.reorder(n, h_o, w_o, c_o, h_i, wio, wii, c_i) + + return s diff --git a/python/tvm/topi/hexagon/resize2d.py b/python/tvm/topi/hexagon/resize2d.py old mode 100755 new mode 100644 index 0e817e2e9330..6e6c0e471db0 --- a/python/tvm/topi/hexagon/resize2d.py +++ b/python/tvm/topi/hexagon/resize2d.py @@ -1,116 +1,116 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
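The split factors in the schedule above are chosen so one inner iteration covers exactly one 2048-byte crouton; a quick arithmetic check of the two chunk shapes mentioned in the comments (the uint8 one used here, and the fp16 one used by fp16 ops such as resize2d below):

# One crouton (fixed chunk) on Hexagon is 2048 bytes.
uint8_chunk_bytes = 8 * 8 * 32 * 1    # 8h x 8w x 32c, 1 byte per element
fp16_chunk_bytes = 8 * 4 * 32 * 2     # 8h x 4w x 32c, 2 bytes per element
assert uint8_chunk_bytes == fp16_chunk_bytes == 2048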
-# pylint: disable=invalid-name - -"""Compute and schedule for resize2d -Please note the following assumptions made by the implementation: -1) The input and output data will be multiple of crouton layout -2) And the supported layout is NHWC""" - -from tvm import te -from tvm import tir -from tvm import topi -from .utils import get_layout_transform_fn - - -def resize2d_compute( - data, - roi, - size, - layout, - method="linear", - coordinate_transformation_mode="half_pixel", - rounding_method="", - bicubic_alpha=-0.5, - bicubic_exclude=0, - extrapolation_value=0.0, - out_dtype=None, - output_shape=None, -): - """Call resize2d op from topi.image""" - return topi.image.resize2d( - data, - roi, - size, - layout, - method, - coordinate_transformation_mode, - rounding_method, - bicubic_alpha, - bicubic_exclude, - extrapolation_value, - out_dtype, - output_shape, - ) - - -def tir_resize2d_schedule( - out_m, - input_a, - input_layout: str, - output_layout: str, -): - """Schedule for input and output layout nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d""" - func = te.create_prim_func([input_a, out_m]) - - s = tir.Schedule(func) - - block = s.get_block("resize") - - if input_layout in ( - "nhwc-8h2w32c2w-2d", - "nhwc-8h8w32c-2d", - ): - input_transformed_layout = get_layout_transform_fn(input_layout) - s.transform_layout(block, buffer=("read", 0), index_map=input_transformed_layout) - - output_transformed_layout = get_layout_transform_fn(output_layout) - s.transform_layout(block, buffer=("write", 0), index_map=output_transformed_layout) - - if output_layout == "nhwc-8h2w32c2w-2d": - # Fixed chunk size is 2048 byte - # For fp16 the layout for fixed chunk is 8x4x32 - # where each element is 2 bytes - # Split and reorder is done to iterate over the fixed chunk - # Channel is split by a factor of 32 - # Width is split by a factor of 4 - # Height is split by a factor of 8 - n, h, w, c = s.get_loops(block) - - ho, hi = s.split(h, [None, 8]) - wo, wi = s.split(w, [None, 4]) - co, ci = s.split(c, [None, 32]) - - s.reorder(n, ho, wo, co, hi, wi, ci) - - elif output_layout == "nhwc-8h8w32c-2d": - # Fixed chunk size is 2048 byte - # For uint8 the layout for fixed chunk is 8x8x32 - # where each element is 1 bytes - # Split and reorder is done to iterate over the fixed chunk - # Channel is split by a factor of 32 - # Width is split by a factor of 8 - # Height is split by a factor of 8 - n, h, w, c = s.get_loops(block) - - ho, hi = s.split(h, [None, 8]) - wo, wi = s.split(w, [None, 8]) - co, ci = s.split(c, [None, 32]) - - s.reorder(n, ho, wo, co, hi, wi, ci) - - return s +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
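A hedged usage sketch for the resize2d wrapper and schedule above; it assumes a TVM build where these Hexagon helpers are importable, uses made-up shapes that are multiples of the 8h2w32c2w crouton, and is untested.

from tvm import te
from tvm.topi.hexagon.resize2d import resize2d_compute, tir_resize2d_schedule

a = te.placeholder((1, 16, 16, 32), dtype="float16", name="a")
out = resize2d_compute(
    a,
    [0.0, 0.0, 0.0, 0.0],              # roi (unused by the chosen transformation mode)
    [32, 32],                          # output height and width
    "NHWC",
    method="nearest_neighbor",
    coordinate_transformation_mode="asymmetric",
    rounding_method="floor",
)
sch = tir_resize2d_schedule(out, a, "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d")
print(sch.mod.script())                # inspect the split and reordered loop nest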
+# pylint: disable=invalid-name + +"""Compute and schedule for resize2d +Please note the following assumptions made by the implementation: +1) The input and output data will be multiple of crouton layout +2) And the supported layout is NHWC""" + +from tvm import te +from tvm import tir +from tvm import topi +from .utils import get_layout_transform_fn + + +def resize2d_compute( + data, + roi, + size, + layout, + method="linear", + coordinate_transformation_mode="half_pixel", + rounding_method="", + bicubic_alpha=-0.5, + bicubic_exclude=0, + extrapolation_value=0.0, + out_dtype=None, + output_shape=None, +): + """Call resize2d op from topi.image""" + return topi.image.resize2d( + data, + roi, + size, + layout, + method, + coordinate_transformation_mode, + rounding_method, + bicubic_alpha, + bicubic_exclude, + extrapolation_value, + out_dtype, + output_shape, + ) + + +def tir_resize2d_schedule( + out_m, + input_a, + input_layout: str, + output_layout: str, +): + """Schedule for input and output layout nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d""" + func = te.create_prim_func([input_a, out_m]) + + s = tir.Schedule(func) + + block = s.get_block("resize") + + if input_layout in ( + "nhwc-8h2w32c2w-2d", + "nhwc-8h8w32c-2d", + ): + input_transformed_layout = get_layout_transform_fn(input_layout) + s.transform_layout(block, buffer=("read", 0), index_map=input_transformed_layout) + + output_transformed_layout = get_layout_transform_fn(output_layout) + s.transform_layout(block, buffer=("write", 0), index_map=output_transformed_layout) + + if output_layout == "nhwc-8h2w32c2w-2d": + # Fixed chunk size is 2048 byte + # For fp16 the layout for fixed chunk is 8x4x32 + # where each element is 2 bytes + # Split and reorder is done to iterate over the fixed chunk + # Channel is split by a factor of 32 + # Width is split by a factor of 4 + # Height is split by a factor of 8 + n, h, w, c = s.get_loops(block) + + ho, hi = s.split(h, [None, 8]) + wo, wi = s.split(w, [None, 4]) + co, ci = s.split(c, [None, 32]) + + s.reorder(n, ho, wo, co, hi, wi, ci) + + elif output_layout == "nhwc-8h8w32c-2d": + # Fixed chunk size is 2048 byte + # For uint8 the layout for fixed chunk is 8x8x32 + # where each element is 1 bytes + # Split and reorder is done to iterate over the fixed chunk + # Channel is split by a factor of 32 + # Width is split by a factor of 8 + # Height is split by a factor of 8 + n, h, w, c = s.get_loops(block) + + ho, hi = s.split(h, [None, 8]) + wo, wi = s.split(w, [None, 8]) + co, ci = s.split(c, [None, 32]) + + s.reorder(n, ho, wo, co, hi, wi, ci) + + return s diff --git a/python/tvm/topi/hexagon/slice_ops/add_subtract_multiply.py b/python/tvm/topi/hexagon/slice_ops/add_subtract_multiply.py old mode 100755 new mode 100644 index 86b6adb997cb..0596f79b66a8 --- a/python/tvm/topi/hexagon/slice_ops/add_subtract_multiply.py +++ b/python/tvm/topi/hexagon/slice_ops/add_subtract_multiply.py @@ -1,87 +1,87 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name - -"""Compute and schedule for add, multiply, subtract slice op - -Please note the following assumptions made by the implementation: - -1) The inputs will be multiple of crouton layout except for the axis that needs broadcasting.""" - -from tvm import te -from tvm import tir -from tvm import topi -from ..utils import get_layout_transform_fn - - -def add_broadcast_compute(input_a, input_b): - """Call the add op from topi""" - return topi.add(input_a, input_b) - - -def subtract_broadcast_compute(input_a, input_b): - """Call the subtract op from topi""" - return topi.subtract(input_a, input_b) - - -def multiply_broadcast_compute(input_a, input_b): - """Call the multiply op from topi""" - return topi.multiply(input_a, input_b) - - -def tir_broadcast_schedule( - out_m, - input_a, - input_b, - output_layout: str, - input_a_layout: str, - input_b_layout: str, - op_name: str, -): - """Schedule for input and output layout nhwc-8h2w32c2w-2d considering broadcast""" - func = te.create_prim_func([input_a, input_b, out_m]) - - s = tir.Schedule(func) - - block_dict = {"add": "T_add", "subtract": "T_subtract", "multiply": "T_multiply"} - - block = s.get_block(block_dict[op_name]) - - if input_a_layout == "nhwc-8h2w32c2w-2d": - input_a_transformed_layout = get_layout_transform_fn(input_a_layout) - s.transform_layout(block, buffer=("read", 0), index_map=input_a_transformed_layout) - - if input_b_layout == "nhwc-8h2w32c2w-2d": - input_b_transformed_layout = get_layout_transform_fn(input_b_layout) - s.transform_layout(block, buffer=("read", 1), index_map=input_b_transformed_layout) - - output_transformed_layout = get_layout_transform_fn(output_layout) - s.transform_layout(block, buffer=("write", 0), index_map=output_transformed_layout) - - n, h, w, c = s.get_loops(block) - - h_o, h_i = s.split(h, [None, 8]) - w_o, w_i = s.split(w, [None, 4]) - c_o, c_i = s.split(c, [None, 32]) - wio, wii = s.split(w_i, [None, 2]) - - s.reorder(n, h_o, w_o, c_o, h_i, wio, c_i, wii) - - fused = s.fuse(c_i, wii) - s.vectorize(fused) - - return s +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
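A hedged usage sketch for the broadcast schedule above; it assumes a TVM build where these Hexagon slice-op helpers are importable, uses made-up fp16 shapes that are multiples of the 8h2w32c2w crouton, and is untested.

from tvm import te
from tvm.topi.hexagon.slice_ops.add_subtract_multiply import (
    add_broadcast_compute,
    tir_broadcast_schedule,
)

a = te.placeholder((1, 16, 16, 64), dtype="float16", name="a")
b = te.placeholder((1, 16, 16, 64), dtype="float16", name="b")
out = add_broadcast_compute(a, b)
sch = tir_broadcast_schedule(
    out, a, b, "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", "add"
)
print(sch.mod.script())   # shows the split, reordered and vectorized loop nest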
+# pylint: disable=invalid-name + +"""Compute and schedule for add, multiply, subtract slice op + +Please note the following assumptions made by the implementation: + +1) The inputs will be multiple of crouton layout except for the axis that needs broadcasting.""" + +from tvm import te +from tvm import tir +from tvm import topi +from ..utils import get_layout_transform_fn + + +def add_broadcast_compute(input_a, input_b): + """Call the add op from topi""" + return topi.add(input_a, input_b) + + +def subtract_broadcast_compute(input_a, input_b): + """Call the subtract op from topi""" + return topi.subtract(input_a, input_b) + + +def multiply_broadcast_compute(input_a, input_b): + """Call the multiply op from topi""" + return topi.multiply(input_a, input_b) + + +def tir_broadcast_schedule( + out_m, + input_a, + input_b, + output_layout: str, + input_a_layout: str, + input_b_layout: str, + op_name: str, +): + """Schedule for input and output layout nhwc-8h2w32c2w-2d considering broadcast""" + func = te.create_prim_func([input_a, input_b, out_m]) + + s = tir.Schedule(func) + + block_dict = {"add": "T_add", "subtract": "T_subtract", "multiply": "T_multiply"} + + block = s.get_block(block_dict[op_name]) + + if input_a_layout == "nhwc-8h2w32c2w-2d": + input_a_transformed_layout = get_layout_transform_fn(input_a_layout) + s.transform_layout(block, buffer=("read", 0), index_map=input_a_transformed_layout) + + if input_b_layout == "nhwc-8h2w32c2w-2d": + input_b_transformed_layout = get_layout_transform_fn(input_b_layout) + s.transform_layout(block, buffer=("read", 1), index_map=input_b_transformed_layout) + + output_transformed_layout = get_layout_transform_fn(output_layout) + s.transform_layout(block, buffer=("write", 0), index_map=output_transformed_layout) + + n, h, w, c = s.get_loops(block) + + h_o, h_i = s.split(h, [None, 8]) + w_o, w_i = s.split(w, [None, 4]) + c_o, c_i = s.split(c, [None, 32]) + wio, wii = s.split(w_i, [None, 2]) + + s.reorder(n, h_o, w_o, c_o, h_i, wio, c_i, wii) + + fused = s.fuse(c_i, wii) + s.vectorize(fused) + + return s diff --git a/python/tvm/topi/hexagon/slice_ops/clip.py b/python/tvm/topi/hexagon/slice_ops/clip.py old mode 100755 new mode 100644 diff --git a/python/tvm/topi/hexagon/slice_ops/global_avg_pool2d.py b/python/tvm/topi/hexagon/slice_ops/global_avg_pool2d.py old mode 100755 new mode 100644 index 30222c11bb54..9e6ae077851e --- a/python/tvm/topi/hexagon/slice_ops/global_avg_pool2d.py +++ b/python/tvm/topi/hexagon/slice_ops/global_avg_pool2d.py @@ -1,52 +1,52 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Assumptions: -1) The input is in NCHW layout. Squeezenet is the only model that calls - nn.global_avg_pool2d and the only layout it uses is 'NCHW'. -2) The op takes input data as an argument. 
-3) Both input and output dtype is float32 and -4) Input is assumed to always be multiple of fixed chunk 32c8h4w. -""" - -from tvm import te -from tvm import tir -from tvm import topi -from ..utils import get_layout_transform_fn - - -def global_avg_pool2d( - data: te.Tensor, -): - """global_avg_pool2d""" - return topi.nn.global_pool(data, "avg", "NCHW") - - -def stir_global_avg_pool2d_schedule(outs: te.Tensor, ins: te.Tensor, input_layout: str): - """Schedule""" - func = te.create_prim_func([ins, outs]) - s = tir.Schedule(func) - - sum_block = s.get_block("adaptive_pool_sum") - - # Input is multiple of fixed chunk but output is NxCx1x1 - # Hence transform_layout is only applied on input - input_transformed_layout = get_layout_transform_fn(input_layout) - s.transform_layout(sum_block, buffer=("read", 0), index_map=input_transformed_layout) - - return s +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Assumptions: +1) The input is in NCHW layout. Squeezenet is the only model that calls + nn.global_avg_pool2d and the only layout it uses is 'NCHW'. +2) The op takes input data as an argument. +3) Both input and output dtype is float32 and +4) Input is assumed to always be multiple of fixed chunk 32c8h4w. +""" + +from tvm import te +from tvm import tir +from tvm import topi +from ..utils import get_layout_transform_fn + + +def global_avg_pool2d( + data: te.Tensor, +): + """global_avg_pool2d""" + return topi.nn.global_pool(data, "avg", "NCHW") + + +def stir_global_avg_pool2d_schedule(outs: te.Tensor, ins: te.Tensor, input_layout: str): + """Schedule""" + func = te.create_prim_func([ins, outs]) + s = tir.Schedule(func) + + sum_block = s.get_block("adaptive_pool_sum") + + # Input is multiple of fixed chunk but output is NxCx1x1 + # Hence transform_layout is only applied on input + input_transformed_layout = get_layout_transform_fn(input_layout) + s.transform_layout(sum_block, buffer=("read", 0), index_map=input_transformed_layout) + + return s diff --git a/src/relay/backend/contrib/dnnl/query_layout.cc b/src/relay/backend/contrib/dnnl/query_layout.cc index 3762c1906f40..63e0d73ce229 100755 --- a/src/relay/backend/contrib/dnnl/query_layout.cc +++ b/src/relay/backend/contrib/dnnl/query_layout.cc @@ -1,379 +1,379 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
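A hedged usage sketch for the fp32 global average pool slice op above; it assumes a TVM build where these helpers are importable, the NCHW shape is a made-up multiple of the 32c8h4w chunk, and the layout string passed to the schedule is illustrative only (use whatever name get_layout_transform_fn accepts for that chunk).

from tvm import te
from tvm.topi.hexagon.slice_ops.global_avg_pool2d import (
    global_avg_pool2d,
    stir_global_avg_pool2d_schedule,
)

data = te.placeholder((1, 32, 8, 8), dtype="float32", name="data")
out = global_avg_pool2d(data)                 # topi.nn.global_pool(..., "avg", "NCHW")
# "nchw-32c8h4w-2d" is an illustrative layout name, not confirmed against utils.py.
sch = stir_global_avg_pool2d_schedule(out, data, "nchw-32c8h4w-2d")
print(sch.mod.script())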
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file src/relay/backend/contrib/dnnl/query_layout.cc - * \brief layout auto-query func. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "../../../../runtime/contrib/dnnl/dnnl_utils.h" -#include "../../utils.h" -#include "dnnl.hpp" -namespace tvm { -namespace relay { -namespace contrib { - -using dim_t = dnnl_dim_t; -using dims_t = dnnl_dims_t; -using tvm::runtime::contrib::dtype_dl2dnnl; - -template -inline void array_set(T* arr, const U& val, size_t size) { - for (size_t i = 0; i < size; ++i) arr[i] = static_cast(val); -} - -template -inline void array_copy(T* dst, const T* src, size_t size) { - for (size_t i = 0; i < size; ++i) dst[i] = src[i]; -} - -template -inline void swap(T& t1, T& t2) { - T tmp(t1); - t1 = t2; - t2 = tmp; -} - -template -inline void simultaneous_sort(T* vals, T* vals_2nd_level, U* keys, size_t size, F comparator) { - if (size == 0) return; - - for (size_t i = 0; i < size - 1; ++i) { - bool swapped = false; - - for (size_t j = 0; j < size - i - 1; j++) { - auto res = comparator(vals[j], vals[j + 1]); - if (res == 0) res = comparator(vals_2nd_level[j], vals_2nd_level[j + 1]); - - if (res > 0) { - swap(vals[j], vals[j + 1]); - swap(vals_2nd_level[j], vals_2nd_level[j + 1]); - swap(keys[j], keys[j + 1]); - swapped = true; - } - } - - if (swapped == false) break; - } -} - -void compute_blocks(dims_t blocks, const dnnl::memory::desc* md) { - using format_kind_t = dnnl_format_kind_t; - const format_kind_t blocked = dnnl_blocked; - if (!(md->data.format_kind == blocked)) { - array_set(blocks, 0, md->data.ndims); - return; - } - array_set(blocks, 1, md->data.ndims); - const auto& bd = md->data.format_desc.blocking; - for (int iblk = 0; iblk < bd.inner_nblks; ++iblk) - blocks[bd.inner_idxs[iblk]] *= bd.inner_blks[iblk]; -} - -inline bool has_runtime_strides(const dnnl::memory::desc* md) { - using format_kind_t = dnnl_format_kind_t; - const format_kind_t blocked = dnnl_blocked; - if (!(md->data.format_kind == blocked)) return false; - for (int d = 0; d < md->data.ndims; ++d) - if (md->data.format_desc.blocking.strides[d] == DNNL_RUNTIME_DIM_VAL) return true; - return false; -} - -std::string md2fmt_tag_str(const dnnl::memory::desc* md) { - const auto& blk = md->data.format_desc.blocking; - - dims_t blocks = {0}; - compute_blocks(blocks, md); - - char dim_chars[DNNL_MAX_NDIMS + 1]; - - dims_t ou_blocks = {0}; - array_copy(ou_blocks, md->data.padded_dims, md->data.ndims); - - bool plain = true; - for (int d = 0; d < md->data.ndims; ++d) { - dim_chars[d] = (blocks[d] == 1 ? 'a' : 'A') + static_cast(d); - if (blocks[d] != 1) plain = false; - ou_blocks[d] /= blocks[d]; - } - - // Can't report meaningful tag for runtime dimensions. 
- if (has_runtime_strides(md)) return "*"; - - dims_t strides; - array_copy(strides, blk.strides, md->data.ndims); - - simultaneous_sort(strides, ou_blocks, dim_chars, md->data.ndims, - [](dim_t a, dim_t b) { return b - a; }); - - dim_chars[md->data.ndims] = '\0'; - - std::string s(dim_chars); - - if (!plain) { - for (int iblk = 0; iblk < blk.inner_nblks; ++iblk) { - char c = ('a' + static_cast(blk.inner_idxs[iblk])); - s += (std::to_string(blk.inner_blks[iblk]) + c); - } - } - return s; -} - -dnnl::memory::dims str2dims(const std::string& str_shape, bool dilates = false, - std::string interval = ",") { - // Split strings - std::vector str_dims; - size_t pos = 0, start = 0; - while ((pos = str_shape.find(interval, start)) != std::string::npos) { - std::string str_dim = str_shape.substr(start, pos - start); - if (pos > start) str_dims.push_back(str_dim); - start = pos + interval.size(); - } - if (str_shape.size() > start) { - str_dims.push_back(str_shape.substr(start)); - } - // transfer string to dims - dnnl::memory::dims out_dims; - if (dilates) { - std::transform(str_dims.begin(), str_dims.end(), std::back_inserter(out_dims), - [](const std::string& str) { return std::stoi(str) - 1; }); - } else { - std::transform(str_dims.begin(), str_dims.end(), std::back_inserter(out_dims), - [](const std::string& str) { return std::stoi(str); }); - } - return out_dims; -} - -void check_shapes(const std::vector shapes) { - std::regex valid_pat("(\\d*)(,(\\d*))*"); - bool checked = std::regex_match(shapes[0], valid_pat); - for (size_t i = 1; i < shapes.size() - 1; i++) { - checked &= std::regex_match(shapes[i], valid_pat); - } - checked &= std::regex_match(shapes[shapes.size() - 1], std::regex("\\d*")); - if (!checked) { - LOG(FATAL) << "Invalid input args for query dnnl optimal layout."; - } -} - -void check_layout(bool var, bool ref) { - if (var != ref) { - LOG(FATAL) << "Invalid input layout for query dnnl optimal layout."; - } -} - -std::string get_optimal_layout_for_conv(std::string data_layout, std::string kernel_layout, - std::string weight_shape, std::string out_shape, - std::string paddings, std::string strides, - std::string dilates, std::string G, std::string dtype) { - check_layout(std::regex_match(data_layout, std::regex("NC(D?)(H?)W")), true); - check_layout(std::regex_match(kernel_layout, std::regex("(G?)OI(D?)(H?)W")), true); - check_shapes({weight_shape, out_shape, paddings, strides, dilates, G}); - - dnnl::engine eng(dnnl::engine::kind::cpu, 0); - dnnl::stream s(eng); - using tag = dnnl::memory::format_tag; - - dnnl::memory::dim groups = std::stoi(G); - dnnl::memory::dims weight_dims_ = str2dims(weight_shape); - dnnl::memory::dims weight_dims = weight_dims_; - - if (groups > 1) { - if (weight_dims_.size() == 5) { - weight_dims = {groups * weight_dims_[1], groups * weight_dims_[2], weight_dims_[3], - weight_dims_[4]}; - } else { - weight_dims[1] = weight_dims[1] * groups; - } - } - - dnnl::memory::dims out_dims = str2dims(out_shape); - dnnl::memory::dims padding_dims = str2dims(paddings); - dnnl::memory::dims padding_dims_l(padding_dims.begin(), - padding_dims.begin() + padding_dims.size() / 2); - dnnl::memory::dims padding_dims_r(padding_dims.end() - padding_dims.size() / 2, - padding_dims.end()); - dnnl::memory::dims strides_dims = str2dims(strides); - dnnl::memory::dims dilates_dims = str2dims(dilates, true); - - dnnl::memory::dims input_dims = out_dims; - input_dims[1] = weight_dims[1]; - for (size_t i = 2; i < out_dims.size(); i++) { - dnnl::memory::dim K = weight_dims[i]; - 
dnnl::memory::dim S = strides_dims[i - 2]; - dnnl::memory::dim D = dilates_dims[i - 2]; - dnnl::memory::dim PL = padding_dims_l[i - 2]; - dnnl::memory::dim PR = padding_dims_r[i - 2]; - dnnl::memory::dim DK = 1 + (K - 1) * (D + 1); - input_dims[i] = out_dims[i] * S - PL - PR + DK - 1; - } - - dnnl::memory::dims conv_src_dims = input_dims; - dnnl::memory::dims conv_weights_dims = weight_dims; - if (groups > 1) { - conv_weights_dims = {groups, out_dims[1] / groups, input_dims[1] / groups}; - conv_weights_dims.insert(conv_weights_dims.end(), weight_dims.begin() + 2, weight_dims.end()); - } - - dnnl::memory::dims conv_dst_dims = out_dims; - dnnl::memory::dims conv_strides = strides_dims; - dnnl::memory::dims conv_dilates = dilates_dims; - dnnl::memory::dims conv_padding_l = padding_dims_l; - dnnl::memory::dims conv_padding_r = padding_dims_r; - - auto dnnl_dtype = dtype_dl2dnnl(tvm::runtime::String2DLDataType(dtype)); - auto conv_src_md = dnnl::memory::desc({conv_src_dims}, dnnl_dtype, tag::any); - auto conv_weights_md = dnnl::memory::desc({conv_weights_dims}, dnnl_dtype, tag::any); - auto conv_dst_md = dnnl::memory::desc({conv_dst_dims}, dnnl_dtype, tag::any); - - auto conv_desc = dnnl::convolution_forward::desc( - dnnl::prop_kind::forward_inference, dnnl::algorithm::convolution_direct, conv_src_md, - conv_weights_md, conv_dst_md, conv_strides, conv_dilates, conv_padding_l, conv_padding_r); - - auto conv_prim_desc = dnnl::convolution_forward::primitive_desc(conv_desc, eng); - - auto src_format = conv_prim_desc.src_desc(); - auto weights_format = conv_prim_desc.weights_desc(); - auto dst_format = conv_prim_desc.dst_desc(); - std::string src_df, weight_df, dst_df; - - src_df = md2fmt_tag_str(&src_format); - weight_df = md2fmt_tag_str(&weights_format); - dst_df = md2fmt_tag_str(&dst_format); - std::string res = src_df + "," + weight_df + "," + dst_df; - return res; -} - -std::string get_optimal_layout_for_conv_transpose(std::string data_layout, - std::string kernel_layout, - std::string weight_shape, std::string out_shape, - std::string paddings, std::string output_paddings, - std::string strides, std::string dilates, - std::string G, std::string dtype) { - check_layout(std::regex_match(data_layout, std::regex("NC(D?)(H?)W")), true); - check_layout(std::regex_match(kernel_layout, std::regex("(G?)((IO)|(OI))(D?)(H?)W")), true); - check_shapes({weight_shape, out_shape, paddings, output_paddings, strides, dilates, G}); - - dnnl::engine eng(dnnl::engine::kind::cpu, 0); - dnnl::stream s(eng); - using tag = dnnl::memory::format_tag; - - dnnl::memory::dim groups = std::stoi(G); - dnnl::memory::dims weight_dims_ = str2dims(weight_shape); - dnnl::memory::dims weight_dims = weight_dims_; - if (groups > 1) { - if (weight_dims_.size() == 5) { - weight_dims = {groups * weight_dims_[1], groups * weight_dims_[2], weight_dims_[3], - weight_dims_[4]}; - } else { - weight_dims[1] = weight_dims[1] * groups; - } - } - dnnl::memory::dims out_dims = str2dims(out_shape); - dnnl::memory::dims padding_dims = str2dims(paddings); - dnnl::memory::dims padding_dims_l(padding_dims.begin(), - padding_dims.begin() + padding_dims.size() / 2); - dnnl::memory::dims padding_dims_r(padding_dims.end() - padding_dims.size() / 2, - padding_dims.end()); - dnnl::memory::dims output_padding_dims = str2dims(output_paddings); - dnnl::memory::dims strides_dims = str2dims(strides); - dnnl::memory::dims dilates_dims = str2dims(dilates, true); - - dnnl::memory::dims input_dims = out_dims; - if (out_dims[1] == weight_dims[0]) { - input_dims[1] 
= weight_dims[1]; - } else { - input_dims[1] = weight_dims[0]; - std::swap(weight_dims[0], weight_dims[1]); - } - for (size_t i = 2; i < out_dims.size(); i++) { - dnnl::memory::dim K = weight_dims[i]; - dnnl::memory::dim S = strides_dims[i - 2]; - dnnl::memory::dim D = dilates_dims[i - 2]; - dnnl::memory::dim PL = padding_dims_l[i - 2]; - dnnl::memory::dim PR = padding_dims_r[i - 2]; - dnnl::memory::dim OP = output_padding_dims[i - 2]; - dnnl::memory::dim DK = 1 + (K - 1) * (D + 1); - input_dims[i] = (out_dims[i] - DK + PL + PR - OP) / S + 1; - } - - dnnl::memory::dims deconv_src_dims = input_dims; - dnnl::memory::dims deconv_weights_dims = weight_dims; - if (groups > 1) { - deconv_weights_dims = {groups, out_dims[1] / groups, input_dims[1] / groups}; - deconv_weights_dims.insert(deconv_weights_dims.end(), weight_dims.begin() + 2, - weight_dims.end()); - } - dnnl::memory::dims deconv_dst_dims = out_dims; - dnnl::memory::dims deconv_strides = strides_dims; - dnnl::memory::dims deconv_dilates = dilates_dims; - dnnl::memory::dims deconv_padding_l = padding_dims_l; - dnnl::memory::dims deconv_padding_r = padding_dims_r; - - auto dnnl_dtype = dtype_dl2dnnl(tvm::runtime::String2DLDataType(dtype)); - auto deconv_src_md = dnnl::memory::desc({deconv_src_dims}, dnnl_dtype, tag::any); - auto deconv_weights_md = dnnl::memory::desc({deconv_weights_dims}, dnnl_dtype, tag::any); - auto deconv_dst_md = dnnl::memory::desc({deconv_dst_dims}, dnnl_dtype, tag::any); - - auto deconv_desc = dnnl::deconvolution_forward::desc( - dnnl::prop_kind::forward_inference, dnnl::algorithm::deconvolution_direct, deconv_src_md, - deconv_weights_md, deconv_dst_md, deconv_strides, deconv_dilates, deconv_padding_l, - deconv_padding_r); - - auto deconv_prim_desc = dnnl::deconvolution_forward::primitive_desc(deconv_desc, eng); - - auto src_format = deconv_prim_desc.src_desc(); - auto weights_format = deconv_prim_desc.weights_desc(); - auto dst_format = deconv_prim_desc.dst_desc(); - std::string src_df, weight_df, dst_df; - - src_df = md2fmt_tag_str(&src_format); - weight_df = md2fmt_tag_str(&weights_format); - dst_df = md2fmt_tag_str(&dst_format); - std::string res = src_df + "," + weight_df + "," + dst_df; - return res; -} - -TVM_REGISTER_GLOBAL("relay.ir.get_optimal_layout_for_conv") - .set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = get_optimal_layout_for_conv(args[0], args[1], args[2], args[3], args[4], args[5], - args[6], args[7], args[8]); - }); - -TVM_REGISTER_GLOBAL("relay.ir.get_optimal_layout_for_conv_transpose") - .set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = get_optimal_layout_for_conv_transpose(args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8], args[9]); - }); - -} // namespace contrib -} // namespace relay -} // namespace tvm +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/dnnl/query_layout.cc + * \brief layout auto-query func. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../../../../runtime/contrib/dnnl/dnnl_utils.h" +#include "../../utils.h" +#include "dnnl.hpp" +namespace tvm { +namespace relay { +namespace contrib { + +using dim_t = dnnl_dim_t; +using dims_t = dnnl_dims_t; +using tvm::runtime::contrib::dtype_dl2dnnl; + +template +inline void array_set(T* arr, const U& val, size_t size) { + for (size_t i = 0; i < size; ++i) arr[i] = static_cast(val); +} + +template +inline void array_copy(T* dst, const T* src, size_t size) { + for (size_t i = 0; i < size; ++i) dst[i] = src[i]; +} + +template +inline void swap(T& t1, T& t2) { + T tmp(t1); + t1 = t2; + t2 = tmp; +} + +template +inline void simultaneous_sort(T* vals, T* vals_2nd_level, U* keys, size_t size, F comparator) { + if (size == 0) return; + + for (size_t i = 0; i < size - 1; ++i) { + bool swapped = false; + + for (size_t j = 0; j < size - i - 1; j++) { + auto res = comparator(vals[j], vals[j + 1]); + if (res == 0) res = comparator(vals_2nd_level[j], vals_2nd_level[j + 1]); + + if (res > 0) { + swap(vals[j], vals[j + 1]); + swap(vals_2nd_level[j], vals_2nd_level[j + 1]); + swap(keys[j], keys[j + 1]); + swapped = true; + } + } + + if (swapped == false) break; + } +} + +void compute_blocks(dims_t blocks, const dnnl::memory::desc* md) { + using format_kind_t = dnnl_format_kind_t; + const format_kind_t blocked = dnnl_blocked; + if (!(md->data.format_kind == blocked)) { + array_set(blocks, 0, md->data.ndims); + return; + } + array_set(blocks, 1, md->data.ndims); + const auto& bd = md->data.format_desc.blocking; + for (int iblk = 0; iblk < bd.inner_nblks; ++iblk) + blocks[bd.inner_idxs[iblk]] *= bd.inner_blks[iblk]; +} + +inline bool has_runtime_strides(const dnnl::memory::desc* md) { + using format_kind_t = dnnl_format_kind_t; + const format_kind_t blocked = dnnl_blocked; + if (!(md->data.format_kind == blocked)) return false; + for (int d = 0; d < md->data.ndims; ++d) + if (md->data.format_desc.blocking.strides[d] == DNNL_RUNTIME_DIM_VAL) return true; + return false; +} + +std::string md2fmt_tag_str(const dnnl::memory::desc* md) { + const auto& blk = md->data.format_desc.blocking; + + dims_t blocks = {0}; + compute_blocks(blocks, md); + + char dim_chars[DNNL_MAX_NDIMS + 1]; + + dims_t ou_blocks = {0}; + array_copy(ou_blocks, md->data.padded_dims, md->data.ndims); + + bool plain = true; + for (int d = 0; d < md->data.ndims; ++d) { + dim_chars[d] = (blocks[d] == 1 ? 'a' : 'A') + static_cast(d); + if (blocks[d] != 1) plain = false; + ou_blocks[d] /= blocks[d]; + } + + // Can't report meaningful tag for runtime dimensions. 
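  // Illustration (assuming oneDNN's documented tag naming; not derived from this patch):
  // a plain dense 4-D tensor yields the tag "abcd", while a blocked weight layout such as
  // OIhw16i16o sorts the outer dimensions by stride ("ABcd") and then appends the inner
  // blocks, giving "ABcd16b16a".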
+ if (has_runtime_strides(md)) return "*"; + + dims_t strides; + array_copy(strides, blk.strides, md->data.ndims); + + simultaneous_sort(strides, ou_blocks, dim_chars, md->data.ndims, + [](dim_t a, dim_t b) { return b - a; }); + + dim_chars[md->data.ndims] = '\0'; + + std::string s(dim_chars); + + if (!plain) { + for (int iblk = 0; iblk < blk.inner_nblks; ++iblk) { + char c = ('a' + static_cast(blk.inner_idxs[iblk])); + s += (std::to_string(blk.inner_blks[iblk]) + c); + } + } + return s; +} + +dnnl::memory::dims str2dims(const std::string& str_shape, bool dilates = false, + std::string interval = ",") { + // Split strings + std::vector str_dims; + size_t pos = 0, start = 0; + while ((pos = str_shape.find(interval, start)) != std::string::npos) { + std::string str_dim = str_shape.substr(start, pos - start); + if (pos > start) str_dims.push_back(str_dim); + start = pos + interval.size(); + } + if (str_shape.size() > start) { + str_dims.push_back(str_shape.substr(start)); + } + // transfer string to dims + dnnl::memory::dims out_dims; + if (dilates) { + std::transform(str_dims.begin(), str_dims.end(), std::back_inserter(out_dims), + [](const std::string& str) { return std::stoi(str) - 1; }); + } else { + std::transform(str_dims.begin(), str_dims.end(), std::back_inserter(out_dims), + [](const std::string& str) { return std::stoi(str); }); + } + return out_dims; +} + +void check_shapes(const std::vector shapes) { + std::regex valid_pat("(\\d*)(,(\\d*))*"); + bool checked = std::regex_match(shapes[0], valid_pat); + for (size_t i = 1; i < shapes.size() - 1; i++) { + checked &= std::regex_match(shapes[i], valid_pat); + } + checked &= std::regex_match(shapes[shapes.size() - 1], std::regex("\\d*")); + if (!checked) { + LOG(FATAL) << "Invalid input args for query dnnl optimal layout."; + } +} + +void check_layout(bool var, bool ref) { + if (var != ref) { + LOG(FATAL) << "Invalid input layout for query dnnl optimal layout."; + } +} + +std::string get_optimal_layout_for_conv(std::string data_layout, std::string kernel_layout, + std::string weight_shape, std::string out_shape, + std::string paddings, std::string strides, + std::string dilates, std::string G, std::string dtype) { + check_layout(std::regex_match(data_layout, std::regex("NC(D?)(H?)W")), true); + check_layout(std::regex_match(kernel_layout, std::regex("(G?)OI(D?)(H?)W")), true); + check_shapes({weight_shape, out_shape, paddings, strides, dilates, G}); + + dnnl::engine eng(dnnl::engine::kind::cpu, 0); + dnnl::stream s(eng); + using tag = dnnl::memory::format_tag; + + dnnl::memory::dim groups = std::stoi(G); + dnnl::memory::dims weight_dims_ = str2dims(weight_shape); + dnnl::memory::dims weight_dims = weight_dims_; + + if (groups > 1) { + if (weight_dims_.size() == 5) { + weight_dims = {groups * weight_dims_[1], groups * weight_dims_[2], weight_dims_[3], + weight_dims_[4]}; + } else { + weight_dims[1] = weight_dims[1] * groups; + } + } + + dnnl::memory::dims out_dims = str2dims(out_shape); + dnnl::memory::dims padding_dims = str2dims(paddings); + dnnl::memory::dims padding_dims_l(padding_dims.begin(), + padding_dims.begin() + padding_dims.size() / 2); + dnnl::memory::dims padding_dims_r(padding_dims.end() - padding_dims.size() / 2, + padding_dims.end()); + dnnl::memory::dims strides_dims = str2dims(strides); + dnnl::memory::dims dilates_dims = str2dims(dilates, true); + + dnnl::memory::dims input_dims = out_dims; + input_dims[1] = weight_dims[1]; + for (size_t i = 2; i < out_dims.size(); i++) { + dnnl::memory::dim K = weight_dims[i]; + 
dnnl::memory::dim S = strides_dims[i - 2]; + dnnl::memory::dim D = dilates_dims[i - 2]; + dnnl::memory::dim PL = padding_dims_l[i - 2]; + dnnl::memory::dim PR = padding_dims_r[i - 2]; + dnnl::memory::dim DK = 1 + (K - 1) * (D + 1); + input_dims[i] = out_dims[i] * S - PL - PR + DK - 1; + } + + dnnl::memory::dims conv_src_dims = input_dims; + dnnl::memory::dims conv_weights_dims = weight_dims; + if (groups > 1) { + conv_weights_dims = {groups, out_dims[1] / groups, input_dims[1] / groups}; + conv_weights_dims.insert(conv_weights_dims.end(), weight_dims.begin() + 2, weight_dims.end()); + } + + dnnl::memory::dims conv_dst_dims = out_dims; + dnnl::memory::dims conv_strides = strides_dims; + dnnl::memory::dims conv_dilates = dilates_dims; + dnnl::memory::dims conv_padding_l = padding_dims_l; + dnnl::memory::dims conv_padding_r = padding_dims_r; + + auto dnnl_dtype = dtype_dl2dnnl(tvm::runtime::String2DLDataType(dtype)); + auto conv_src_md = dnnl::memory::desc({conv_src_dims}, dnnl_dtype, tag::any); + auto conv_weights_md = dnnl::memory::desc({conv_weights_dims}, dnnl_dtype, tag::any); + auto conv_dst_md = dnnl::memory::desc({conv_dst_dims}, dnnl_dtype, tag::any); + + auto conv_desc = dnnl::convolution_forward::desc( + dnnl::prop_kind::forward_inference, dnnl::algorithm::convolution_direct, conv_src_md, + conv_weights_md, conv_dst_md, conv_strides, conv_dilates, conv_padding_l, conv_padding_r); + + auto conv_prim_desc = dnnl::convolution_forward::primitive_desc(conv_desc, eng); + + auto src_format = conv_prim_desc.src_desc(); + auto weights_format = conv_prim_desc.weights_desc(); + auto dst_format = conv_prim_desc.dst_desc(); + std::string src_df, weight_df, dst_df; + + src_df = md2fmt_tag_str(&src_format); + weight_df = md2fmt_tag_str(&weights_format); + dst_df = md2fmt_tag_str(&dst_format); + std::string res = src_df + "," + weight_df + "," + dst_df; + return res; +} + +std::string get_optimal_layout_for_conv_transpose(std::string data_layout, + std::string kernel_layout, + std::string weight_shape, std::string out_shape, + std::string paddings, std::string output_paddings, + std::string strides, std::string dilates, + std::string G, std::string dtype) { + check_layout(std::regex_match(data_layout, std::regex("NC(D?)(H?)W")), true); + check_layout(std::regex_match(kernel_layout, std::regex("(G?)((IO)|(OI))(D?)(H?)W")), true); + check_shapes({weight_shape, out_shape, paddings, output_paddings, strides, dilates, G}); + + dnnl::engine eng(dnnl::engine::kind::cpu, 0); + dnnl::stream s(eng); + using tag = dnnl::memory::format_tag; + + dnnl::memory::dim groups = std::stoi(G); + dnnl::memory::dims weight_dims_ = str2dims(weight_shape); + dnnl::memory::dims weight_dims = weight_dims_; + if (groups > 1) { + if (weight_dims_.size() == 5) { + weight_dims = {groups * weight_dims_[1], groups * weight_dims_[2], weight_dims_[3], + weight_dims_[4]}; + } else { + weight_dims[1] = weight_dims[1] * groups; + } + } + dnnl::memory::dims out_dims = str2dims(out_shape); + dnnl::memory::dims padding_dims = str2dims(paddings); + dnnl::memory::dims padding_dims_l(padding_dims.begin(), + padding_dims.begin() + padding_dims.size() / 2); + dnnl::memory::dims padding_dims_r(padding_dims.end() - padding_dims.size() / 2, + padding_dims.end()); + dnnl::memory::dims output_padding_dims = str2dims(output_paddings); + dnnl::memory::dims strides_dims = str2dims(strides); + dnnl::memory::dims dilates_dims = str2dims(dilates, true); + + dnnl::memory::dims input_dims = out_dims; + if (out_dims[1] == weight_dims[0]) { + input_dims[1] 
= weight_dims[1]; + } else { + input_dims[1] = weight_dims[0]; + std::swap(weight_dims[0], weight_dims[1]); + } + for (size_t i = 2; i < out_dims.size(); i++) { + dnnl::memory::dim K = weight_dims[i]; + dnnl::memory::dim S = strides_dims[i - 2]; + dnnl::memory::dim D = dilates_dims[i - 2]; + dnnl::memory::dim PL = padding_dims_l[i - 2]; + dnnl::memory::dim PR = padding_dims_r[i - 2]; + dnnl::memory::dim OP = output_padding_dims[i - 2]; + dnnl::memory::dim DK = 1 + (K - 1) * (D + 1); + input_dims[i] = (out_dims[i] - DK + PL + PR - OP) / S + 1; + } + + dnnl::memory::dims deconv_src_dims = input_dims; + dnnl::memory::dims deconv_weights_dims = weight_dims; + if (groups > 1) { + deconv_weights_dims = {groups, out_dims[1] / groups, input_dims[1] / groups}; + deconv_weights_dims.insert(deconv_weights_dims.end(), weight_dims.begin() + 2, + weight_dims.end()); + } + dnnl::memory::dims deconv_dst_dims = out_dims; + dnnl::memory::dims deconv_strides = strides_dims; + dnnl::memory::dims deconv_dilates = dilates_dims; + dnnl::memory::dims deconv_padding_l = padding_dims_l; + dnnl::memory::dims deconv_padding_r = padding_dims_r; + + auto dnnl_dtype = dtype_dl2dnnl(tvm::runtime::String2DLDataType(dtype)); + auto deconv_src_md = dnnl::memory::desc({deconv_src_dims}, dnnl_dtype, tag::any); + auto deconv_weights_md = dnnl::memory::desc({deconv_weights_dims}, dnnl_dtype, tag::any); + auto deconv_dst_md = dnnl::memory::desc({deconv_dst_dims}, dnnl_dtype, tag::any); + + auto deconv_desc = dnnl::deconvolution_forward::desc( + dnnl::prop_kind::forward_inference, dnnl::algorithm::deconvolution_direct, deconv_src_md, + deconv_weights_md, deconv_dst_md, deconv_strides, deconv_dilates, deconv_padding_l, + deconv_padding_r); + + auto deconv_prim_desc = dnnl::deconvolution_forward::primitive_desc(deconv_desc, eng); + + auto src_format = deconv_prim_desc.src_desc(); + auto weights_format = deconv_prim_desc.weights_desc(); + auto dst_format = deconv_prim_desc.dst_desc(); + std::string src_df, weight_df, dst_df; + + src_df = md2fmt_tag_str(&src_format); + weight_df = md2fmt_tag_str(&weights_format); + dst_df = md2fmt_tag_str(&dst_format); + std::string res = src_df + "," + weight_df + "," + dst_df; + return res; +} + +TVM_REGISTER_GLOBAL("relay.ir.get_optimal_layout_for_conv") + .set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = get_optimal_layout_for_conv(args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7], args[8]); + }); + +TVM_REGISTER_GLOBAL("relay.ir.get_optimal_layout_for_conv_transpose") + .set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = get_optimal_layout_for_conv_transpose(args[0], args[1], args[2], args[3], args[4], + args[5], args[6], args[7], args[8], args[9]); + }); + +} // namespace contrib +} // namespace relay +} // namespace tvm diff --git a/tests/python/contrib/test_dnnl.py b/tests/python/contrib/test_dnnl.py old mode 100755 new mode 100644 diff --git a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py old mode 100755 new mode 100644 diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_global_avg_pool2d.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_global_avg_pool2d.py old mode 100755 new mode 100644 index 3f7e999c7bca..7cde83e0cb77 --- a/tests/python/contrib/test_hexagon/topi/slice_op/test_global_avg_pool2d.py +++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_global_avg_pool2d.py @@ -1,167 +1,167 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# 
or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Test code for float16 and uint8 global_avg_pool2d.""" - -import numpy as np - -import tvm -from tvm import te -from tvm.topi.testing import adaptive_pool -import tvm.topi.hexagon.qnn as qn -import tvm.topi.hexagon.slice_ops as sl -from tvm.contrib.hexagon import allocate_hexagon_array -from ...infrastructure import transform_numpy, quantize_np, get_hexagon_target - - -SCALE_M_VAL = None -ZERO_POINT_M_VAL = None -SCALE_VAL = None -ZERO_POINT_VAL = None - - -class TestGlobalPool2D: - (input_shape,) = tvm.testing.parameters( - ([1, 32, 8, 8],), - ([1, 1056, 16, 16],), - ) - - # Fixed chunk layout is set as nchw-32c8h8w-2d for uint8 and nchw-32c8h4w-2d for float16. - # For optimization, it might get changed later. - # Since output shape will be NxCx1x1 which is not a - # multiple of fixed-chunk, output_layout is NCHW. - input_layout, output_layout, pool_type, layout, dtype = tvm.testing.parameters( - ("nchw-32c8h8w-2d", "nchw", "avg", "NCHW", "uint8"), - ("nchw-32c8h4w-2d", "nchw", "avg", "NCHW", "float16"), - ) - - @tvm.testing.fixture - def expected_output_np( - self, - input_np, - pool_type, - layout, - ): - """Generate expected output.""" - ref_np = tvm.topi.testing.adaptive_pool( - input_np, - (1, 1), - pool_type, - layout, - ) - return ref_np - - @tvm.testing.fixture - def input_np(self, input_shape, dtype): - if dtype in ("uint8", "int8"): - dtype = "float32" - return np.random.random(input_shape).astype(dtype) - - @tvm.testing.fixture - def quantize_input_np(self, input_np, dtype): - if dtype in ("uint8", "int8"): - global ZERO_POINT_VAL, SCALE_VAL - input_np_quantized, SCALE_VAL, ZERO_POINT_VAL = quantize_np(input_np, dtype) - return input_np_quantized - - @tvm.testing.fixture - def transformed_input_np(self, input_np, quantize_input_np, input_layout, layout, dtype): - if dtype == "float16": - return transform_numpy(input_np, layout.lower(), input_layout) - if dtype in ("uint8", "int8"): - return transform_numpy(quantize_input_np, layout.lower(), input_layout) - - raise RuntimeError(f"Unsupported data type '{dtype}'") - - @tvm.testing.fixture - def quantize_expected_output_np(self, expected_output_np, dtype): - if dtype in ("uint8", "int8"): - global ZERO_POINT_M_VAL, SCALE_M_VAL - out_ref_quantized, SCALE_M_VAL, ZERO_POINT_M_VAL = quantize_np( - expected_output_np, dtype - ) - - # Since output_layout is nchw, no transformation is needed. 
- return out_ref_quantized - - @tvm.testing.requires_hexagon - def test_global_pool2d( - self, - dtype, - input_shape, - input_layout, - transformed_input_np, - expected_output_np, - quantize_expected_output_np, - hexagon_session, - ): - a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=dtype) - - if dtype == "float16": - m_tensor = sl.global_avg_pool2d(a_tensor) - tir_schedule = sl.stir_global_avg_pool2d_schedule(m_tensor, a_tensor, input_layout) - elif dtype in ["uint8", "int8"]: - m_tensor = qn.global_avg_pool2d_u8( - a_tensor, - dtype, - ZERO_POINT_VAL, - SCALE_VAL, - ZERO_POINT_M_VAL, - SCALE_M_VAL, - ) - tir_schedule = qn.stir_global_avg_pool2d_u8_schedule(m_tensor, a_tensor, input_layout) - - sch = tir_schedule.mod - - with tvm.transform.PassContext(opt_level=3): - func = tvm.build( - sch, - [a_tensor, m_tensor], - get_hexagon_target("v69"), - name="global_pool2d", - ) - - input_axis_separator = [4] - - a_data_nd = allocate_hexagon_array( - hexagon_session.device, - data=transformed_input_np, - dtype=dtype, - axis_separators=input_axis_separator, - mem_scope="global.vtcm", - ) - - m_data_nd = allocate_hexagon_array( - hexagon_session.device, - expected_output_np.shape, - dtype=dtype, - ) - - mod = hexagon_session.load_module(func) - mod(a_data_nd, m_data_nd) - - # Convert nd to np - m_data_np = m_data_nd.numpy() - - if dtype == "float16": - np.testing.assert_allclose(expected_output_np, m_data_np, rtol=1e-3, atol=1e-3) - elif dtype in ["int8", "uint8"]: - np.testing.assert_allclose(quantize_expected_output_np, m_data_np, atol=1) - - -if __name__ == "__main__": - tvm.testing.main() +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Test code for float16 and uint8 global_avg_pool2d.""" + +import numpy as np + +import tvm +from tvm import te +from tvm.topi.testing import adaptive_pool +import tvm.topi.hexagon.qnn as qn +import tvm.topi.hexagon.slice_ops as sl +from tvm.contrib.hexagon import allocate_hexagon_array +from ...infrastructure import transform_numpy, quantize_np, get_hexagon_target + + +SCALE_M_VAL = None +ZERO_POINT_M_VAL = None +SCALE_VAL = None +ZERO_POINT_VAL = None + + +class TestGlobalPool2D: + (input_shape,) = tvm.testing.parameters( + ([1, 32, 8, 8],), + ([1, 1056, 16, 16],), + ) + + # Fixed chunk layout is set as nchw-32c8h8w-2d for uint8 and nchw-32c8h4w-2d for float16. + # For optimization, it might get changed later. + # Since output shape will be NxCx1x1 which is not a + # multiple of fixed-chunk, output_layout is NCHW. 
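    # A rough sketch of the packing assumed for the fixed-chunk strings above (the exact
    # transform is owned by transform_numpy / get_layout_transform_fn): "nchw-32c8h8w-2d"
    # splits C, H, W into blocks of 32, 8, 8, roughly
    #   x.reshape(n, c // 32, 32, h // 8, 8, w // 8, 8).transpose(0, 1, 3, 5, 2, 4, 6)
    # so each chunk holds 32 * 8 * 8 uint8 values (2 KB); the float16 variant uses 8h4w
    # blocks, which is again 32 * 8 * 4 * 2 bytes = 2 KB per chunk.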
+ input_layout, output_layout, pool_type, layout, dtype = tvm.testing.parameters( + ("nchw-32c8h8w-2d", "nchw", "avg", "NCHW", "uint8"), + ("nchw-32c8h4w-2d", "nchw", "avg", "NCHW", "float16"), + ) + + @tvm.testing.fixture + def expected_output_np( + self, + input_np, + pool_type, + layout, + ): + """Generate expected output.""" + ref_np = tvm.topi.testing.adaptive_pool( + input_np, + (1, 1), + pool_type, + layout, + ) + return ref_np + + @tvm.testing.fixture + def input_np(self, input_shape, dtype): + if dtype in ("uint8", "int8"): + dtype = "float32" + return np.random.random(input_shape).astype(dtype) + + @tvm.testing.fixture + def quantize_input_np(self, input_np, dtype): + if dtype in ("uint8", "int8"): + global ZERO_POINT_VAL, SCALE_VAL + input_np_quantized, SCALE_VAL, ZERO_POINT_VAL = quantize_np(input_np, dtype) + return input_np_quantized + + @tvm.testing.fixture + def transformed_input_np(self, input_np, quantize_input_np, input_layout, layout, dtype): + if dtype == "float16": + return transform_numpy(input_np, layout.lower(), input_layout) + if dtype in ("uint8", "int8"): + return transform_numpy(quantize_input_np, layout.lower(), input_layout) + + raise RuntimeError(f"Unsupported data type '{dtype}'") + + @tvm.testing.fixture + def quantize_expected_output_np(self, expected_output_np, dtype): + if dtype in ("uint8", "int8"): + global ZERO_POINT_M_VAL, SCALE_M_VAL + out_ref_quantized, SCALE_M_VAL, ZERO_POINT_M_VAL = quantize_np( + expected_output_np, dtype + ) + + # Since output_layout is nchw, no transformation is needed. + return out_ref_quantized + + @tvm.testing.requires_hexagon + def test_global_pool2d( + self, + dtype, + input_shape, + input_layout, + transformed_input_np, + expected_output_np, + quantize_expected_output_np, + hexagon_session, + ): + a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=dtype) + + if dtype == "float16": + m_tensor = sl.global_avg_pool2d(a_tensor) + tir_schedule = sl.stir_global_avg_pool2d_schedule(m_tensor, a_tensor, input_layout) + elif dtype in ["uint8", "int8"]: + m_tensor = qn.global_avg_pool2d_u8( + a_tensor, + dtype, + ZERO_POINT_VAL, + SCALE_VAL, + ZERO_POINT_M_VAL, + SCALE_M_VAL, + ) + tir_schedule = qn.stir_global_avg_pool2d_u8_schedule(m_tensor, a_tensor, input_layout) + + sch = tir_schedule.mod + + with tvm.transform.PassContext(opt_level=3): + func = tvm.build( + sch, + [a_tensor, m_tensor], + get_hexagon_target("v69"), + name="global_pool2d", + ) + + input_axis_separator = [4] + + a_data_nd = allocate_hexagon_array( + hexagon_session.device, + data=transformed_input_np, + dtype=dtype, + axis_separators=input_axis_separator, + mem_scope="global.vtcm", + ) + + m_data_nd = allocate_hexagon_array( + hexagon_session.device, + expected_output_np.shape, + dtype=dtype, + ) + + mod = hexagon_session.load_module(func) + mod(a_data_nd, m_data_nd) + + # Convert nd to np + m_data_np = m_data_nd.numpy() + + if dtype == "float16": + np.testing.assert_allclose(expected_output_np, m_data_np, rtol=1e-3, atol=1e-3) + elif dtype in ["int8", "uint8"]: + np.testing.assert_allclose(quantize_expected_output_np, m_data_np, atol=1) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/topi/test_adaptive_avg_pool1d.py b/tests/python/contrib/test_hexagon/topi/test_adaptive_avg_pool1d.py old mode 100755 new mode 100644 index 4d4aef25e33f..e5b6c4d79065 --- a/tests/python/contrib/test_hexagon/topi/test_adaptive_avg_pool1d.py +++ b/tests/python/contrib/test_hexagon/topi/test_adaptive_avg_pool1d.py @@ 
-1,185 +1,185 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Test code for specialized case of adaptive_avg_pool1d.""" - -import numpy as np - -import tvm -from tvm import te -from tvm.topi.testing import adaptive_pool -import tvm.topi.hexagon.qnn as s1 -from tvm.contrib.hexagon import allocate_hexagon_array -from ..infrastructure import transform_numpy, quantize_np - - -SCALE_M_VAL = None -ZERO_POINT_M_VAL = None -SCALE_VAL = None -ZERO_POINT_VAL = None - - -class TestAdaptivePool1D: - """Test specialized case of adaptive_avg_pool1d.""" - - (input_shape,) = tvm.testing.parameters( - ([1, 128, 128],), - ([1, 64, 64],), - ([1, 64, 128],), - ([1, 32, 64],), - ([1, 128, 768],), - ) - - # Fixed chunk layout is set as ncw-32c64w-2d for now. - # The adaptive_avg_pool1d implementation only handles specialized case - # where output_size is 1 as it appears on quantized distilbert model. - # Since output size won't be a multiple of fixed-chunk, - # output_layout is ncw. - # For optimization, it might get changed later. - input_layout, output_layout, pool_type, layout, output_size, dtype, = tvm.testing.parameters( - ( - "ncw-32c64w-2d", - "ncw", - "avg", - "NCW", - [1], - "uint8", - ) - ) - - @tvm.testing.fixture - def expected_output_np( - self, - input_np, - output_size, - pool_type, - layout, - ): - """Generate expected output.""" - out_width = output_size[0] - - ref_np = adaptive_pool( - input_np, - out_width, - pool_type, - layout, - ) - return ref_np - - @tvm.testing.fixture - def input_np(self, input_shape, dtype): - if dtype in ("uint8", "int8"): - dtype = "float32" - return np.random.random(input_shape).astype(dtype) - - @tvm.testing.fixture - def quantize_input_np(self, input_np, dtype): - if dtype in ("uint8", "int8"): - global ZERO_POINT_VAL, SCALE_VAL - input_np_quantized, SCALE_VAL, ZERO_POINT_VAL = quantize_np(input_np, dtype) - return input_np_quantized - - raise RuntimeError(f"Unsupported data type '{dtype}'") - - @tvm.testing.fixture - def transformed_input_np(self, quantize_input_np, input_layout, layout, dtype): - if dtype in ("uint8", "int8"): - return transform_numpy(quantize_input_np, layout.lower(), input_layout) - - raise RuntimeError(f"Unsupported data type '{dtype}'") - - @tvm.testing.fixture - def quantize_expected_output_np(self, expected_output_np, dtype): - """Generate expected output.""" - if dtype in ("uint8", "int8"): - global ZERO_POINT_M_VAL, SCALE_M_VAL - out_ref_quantized, SCALE_M_VAL, ZERO_POINT_M_VAL = quantize_np( - expected_output_np, dtype - ) - - # Since output_layout is ncw, no transformation is needed. 
- return out_ref_quantized - - raise RuntimeError(f"Unsupported data type '{dtype}'") - - @tvm.testing.requires_hexagon - def test_pool1d( - self, - dtype, - output_size, - input_layout, - output_layout, - input_shape, - transformed_input_np, - quantize_expected_output_np, - hexagon_session, - ): - """Test adaptive_avg_pool1d.""" - target_hexagon = tvm.target.hexagon("v69") - a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=dtype) - - m_tensor = s1.adaptive_avg_pool1d( - a_tensor, - output_size, - dtype, - ZERO_POINT_VAL, - SCALE_VAL, - ZERO_POINT_M_VAL, - SCALE_M_VAL, - ) - - tir_schedule = s1.tir_adaptive_avg_pool1d_schedule( - m_tensor, a_tensor, output_layout, input_layout - ) - - sch = tir_schedule.mod - - with tvm.transform.PassContext(opt_level=3): - func = tvm.build( - sch, - [a_tensor, m_tensor], - tvm.target.Target(target_hexagon, host=target_hexagon), - name="adaptive_pool1d", - ) - - input_axis_separator = [3] - - a_data_nd = allocate_hexagon_array( - hexagon_session.device, - data=transformed_input_np, - dtype=dtype, - axis_separators=input_axis_separator, - mem_scope="global.vtcm", - ) - - m_data_nd = allocate_hexagon_array( - hexagon_session.device, - quantize_expected_output_np.shape, - dtype=dtype, - ) - - mod = hexagon_session.load_module(func) - mod(a_data_nd, m_data_nd) - - # Convert nd to np - m_data_np = m_data_nd.numpy() - - np.testing.assert_allclose(quantize_expected_output_np, m_data_np, atol=2) - - -if __name__ == "__main__": - tvm.testing.main() +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Test code for specialized case of adaptive_avg_pool1d.""" + +import numpy as np + +import tvm +from tvm import te +from tvm.topi.testing import adaptive_pool +import tvm.topi.hexagon.qnn as s1 +from tvm.contrib.hexagon import allocate_hexagon_array +from ..infrastructure import transform_numpy, quantize_np + + +SCALE_M_VAL = None +ZERO_POINT_M_VAL = None +SCALE_VAL = None +ZERO_POINT_VAL = None + + +class TestAdaptivePool1D: + """Test specialized case of adaptive_avg_pool1d.""" + + (input_shape,) = tvm.testing.parameters( + ([1, 128, 128],), + ([1, 64, 64],), + ([1, 64, 128],), + ([1, 32, 64],), + ([1, 128, 768],), + ) + + # Fixed chunk layout is set as ncw-32c64w-2d for now. + # The adaptive_avg_pool1d implementation only handles specialized case + # where output_size is 1 as it appears on quantized distilbert model. + # Since output size won't be a multiple of fixed-chunk, + # output_layout is ncw. + # For optimization, it might get changed later. 
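    # Intuition for the specialized case exercised here (a sketch only; the test keeps
    # adaptive_pool as its reference): with output_size == 1, adaptive_avg_pool1d in NCW
    # layout is simply a per-channel mean over W, i.e.
    #   input_np.mean(axis=-1, keepdims=True)
    # and the uint8 chunk 32c64w again packs 32 * 64 = 2048 bytes per crouton in VTCM.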
+ input_layout, output_layout, pool_type, layout, output_size, dtype, = tvm.testing.parameters( + ( + "ncw-32c64w-2d", + "ncw", + "avg", + "NCW", + [1], + "uint8", + ) + ) + + @tvm.testing.fixture + def expected_output_np( + self, + input_np, + output_size, + pool_type, + layout, + ): + """Generate expected output.""" + out_width = output_size[0] + + ref_np = adaptive_pool( + input_np, + out_width, + pool_type, + layout, + ) + return ref_np + + @tvm.testing.fixture + def input_np(self, input_shape, dtype): + if dtype in ("uint8", "int8"): + dtype = "float32" + return np.random.random(input_shape).astype(dtype) + + @tvm.testing.fixture + def quantize_input_np(self, input_np, dtype): + if dtype in ("uint8", "int8"): + global ZERO_POINT_VAL, SCALE_VAL + input_np_quantized, SCALE_VAL, ZERO_POINT_VAL = quantize_np(input_np, dtype) + return input_np_quantized + + raise RuntimeError(f"Unsupported data type '{dtype}'") + + @tvm.testing.fixture + def transformed_input_np(self, quantize_input_np, input_layout, layout, dtype): + if dtype in ("uint8", "int8"): + return transform_numpy(quantize_input_np, layout.lower(), input_layout) + + raise RuntimeError(f"Unsupported data type '{dtype}'") + + @tvm.testing.fixture + def quantize_expected_output_np(self, expected_output_np, dtype): + """Generate expected output.""" + if dtype in ("uint8", "int8"): + global ZERO_POINT_M_VAL, SCALE_M_VAL + out_ref_quantized, SCALE_M_VAL, ZERO_POINT_M_VAL = quantize_np( + expected_output_np, dtype + ) + + # Since output_layout is ncw, no transformation is needed. + return out_ref_quantized + + raise RuntimeError(f"Unsupported data type '{dtype}'") + + @tvm.testing.requires_hexagon + def test_pool1d( + self, + dtype, + output_size, + input_layout, + output_layout, + input_shape, + transformed_input_np, + quantize_expected_output_np, + hexagon_session, + ): + """Test adaptive_avg_pool1d.""" + target_hexagon = tvm.target.hexagon("v69") + a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=dtype) + + m_tensor = s1.adaptive_avg_pool1d( + a_tensor, + output_size, + dtype, + ZERO_POINT_VAL, + SCALE_VAL, + ZERO_POINT_M_VAL, + SCALE_M_VAL, + ) + + tir_schedule = s1.tir_adaptive_avg_pool1d_schedule( + m_tensor, a_tensor, output_layout, input_layout + ) + + sch = tir_schedule.mod + + with tvm.transform.PassContext(opt_level=3): + func = tvm.build( + sch, + [a_tensor, m_tensor], + tvm.target.Target(target_hexagon, host=target_hexagon), + name="adaptive_pool1d", + ) + + input_axis_separator = [3] + + a_data_nd = allocate_hexagon_array( + hexagon_session.device, + data=transformed_input_np, + dtype=dtype, + axis_separators=input_axis_separator, + mem_scope="global.vtcm", + ) + + m_data_nd = allocate_hexagon_array( + hexagon_session.device, + quantize_expected_output_np.shape, + dtype=dtype, + ) + + mod = hexagon_session.load_module(func) + mod(a_data_nd, m_data_nd) + + # Convert nd to np + m_data_np = m_data_nd.numpy() + + np.testing.assert_allclose(quantize_expected_output_np, m_data_np, atol=2) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py index e0bb6b5864d3..94cb5ffca543 100644 --- a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py +++ b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py @@ -1,411 +1,411 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Test code for Add, Subtract and Multiply.""" -import numpy as np - -import tvm -from tvm import te -import tvm.topi.hexagon.slice_ops as sl -import tvm.topi.hexagon.qnn as qn -from tvm.contrib.hexagon import allocate_hexagon_array -from ..infrastructure import ( - transform_numpy, - quantize_np, - get_hexagon_target, -) - -ZERO_POINT_A_VAL = None -SCALE_A_VAL = None - -ZERO_POINT_B_VAL = None -SCALE_B_VAL = None - -ZERO_POINT_M_VAL = None -SCALE_M_VAL = None - - -def hexagon_wrapper_allocation( - device, - layout, - axis_separators, - tensor_shape=None, - data_original=None, - transformed_data=None, - dtype=None, -): - """Input layout can either be nhwc-8h2w32c2w-2d or nhwc""" - if layout in ["nhwc-8h2w32c2w-2d", "nhwc-8h8w32c-2d"]: - data_nd = allocate_hexagon_array( - device, - tensor_shape=tensor_shape, - data=transformed_data, - dtype=dtype, - axis_separators=axis_separators, - mem_scope="global.vtcm", - ) - elif layout == "nhwc": - data_nd = allocate_hexagon_array( - device, - data=data_original, - ) - return data_nd - - -class TestAddSubtractMultiplyBroadcast2d: - """Test Add, Subtract and Multiply class.""" - - ( - input_shape_a, - input_shape_b, - input_a_layout, - input_b_layout, - output_layout, - dtype, - ) = tvm.testing.parameters( - # no broadcast needed - short input - ( - [1, 8, 4, 32], - [1, 8, 4, 32], - "nhwc-8h2w32c2w-2d", - "nhwc-8h2w32c2w-2d", - "nhwc-8h2w32c2w-2d", - "float16", - ), - # no broadcast needed - large input - ( - [1, 56, 64, 128], - [1, 56, 64, 128], - "nhwc-8h2w32c2w-2d", - "nhwc-8h2w32c2w-2d", - "nhwc-8h2w32c2w-2d", - "float16", - ), - # one input needs broadcast - ( - [1, 56, 64, 128], - [1, 1, 64, 1], - "nhwc-8h2w32c2w-2d", - "nhwc", - "nhwc-8h2w32c2w-2d", - "float16", - ), - # Both input needs broadcast - ( - [1, 56, 1, 128], - [1, 1, 64, 1], - "nhwc", - "nhwc", - "nhwc-8h2w32c2w-2d", - "float16", - ), - # One axis in one input needs broadcast - ( - [1, 56, 20, 128], - [1, 56, 20, 1], - "nhwc-8h2w32c2w-2d", - "nhwc", - "nhwc-8h2w32c2w-2d", - "float16", - ), - # broadcast all axes in one input - ( - [1, 48, 56, 32], - [1, 1, 1, 1], - "nhwc-8h2w32c2w-2d", - "nhwc", - "nhwc-8h2w32c2w-2d", - "float16", - ), - ( - [1, 48, 32, 64], - [1, 48, 32, 64], - "nhwc-8h8w32c-2d", - "nhwc-8h8w32c-2d", - "nhwc-8h8w32c-2d", - "uint8", - ), - # broadcast axis 2 in one input - ( - [1, 48, 32, 64], - [1, 48, 1, 64], - "nhwc-8h8w32c-2d", - "nhwc", - "nhwc-8h8w32c-2d", - "uint8", - ), - # broadcast axis 1 in one input - ( - [1, 48, 32, 64], - [1, 1, 32, 64], - "nhwc-8h8w32c-2d", - "nhwc", - "nhwc-8h8w32c-2d", - "uint8", - ), - # broadcast axis 3 in one input - ( - [1, 8, 8, 32], - [1, 8, 8, 1], - "nhwc-8h8w32c-2d", - "nhwc", - "nhwc-8h8w32c-2d", - "uint8", - ), - # broadcast both inputs - ( - [1, 56, 1, 128], - [1, 1, 64, 1], - "nhwc", - "nhwc", - "nhwc-8h8w32c-2d", - "uint8", - ), - # 
broadcast both inputs - ( - [1, 48, 1, 1], - [1, 1, 32, 32], - "nhwc", - "nhwc", - "nhwc-8h8w32c-2d", - "uint8", - ), - # broadcast both inputs - ( - [1, 48, 1, 32], - [1, 1, 32, 1], - "nhwc", - "nhwc", - "nhwc-8h8w32c-2d", - "uint8", - ), - # broadcast all axes in one input - ( - [1, 48, 56, 32], - [1, 1, 1, 1], - "nhwc-8h8w32c-2d", - "nhwc", - "nhwc-8h8w32c-2d", - "uint8", - ), - ) - - op_name = tvm.testing.parameter("add", "subtract", "multiply") - - @tvm.testing.fixture - def expected_output_np(self, input_np_a, input_np_b, op_name): - """Generate expected output.""" - if op_name == "add": - out_ref = np.add(input_np_a, input_np_b) - elif op_name == "subtract": - out_ref = np.subtract(input_np_a, input_np_b) - elif op_name == "multiply": - out_ref = np.multiply(input_np_a, input_np_b) - return out_ref - - @tvm.testing.fixture - def transformed_expected_output_np(self, expected_output_np, output_layout, dtype): - """Generate expected output.""" - if dtype == "float16": - return transform_numpy(expected_output_np, "nhwc", output_layout) - if dtype in ["uint8", "int8"]: - global ZERO_POINT_M_VAL, SCALE_M_VAL - out_ref_quantized, SCALE_M_VAL, ZERO_POINT_M_VAL = quantize_np( - expected_output_np, dtype - ) - return transform_numpy(out_ref_quantized, "nhwc", output_layout) - - raise RuntimeError(f"Unsupported data type '{dtype}'") - - @tvm.testing.fixture - def input_np_a(self, input_shape_a, dtype): - """Generate numpy input for variable a.""" - if dtype in ["uint8", "int8"]: - dtype = "float32" - return np.random.random(input_shape_a).astype(dtype) - - @tvm.testing.fixture - def input_np_b(self, input_shape_b, dtype): - """Generate numpy input for variable b.""" - if dtype in ["uint8", "int8"]: - dtype = "float32" - return np.random.random(input_shape_b).astype(dtype) - - @tvm.testing.fixture - def quantize_input_np_a(self, input_np_a, dtype): - if dtype in ["uint8", "int8"]: - global ZERO_POINT_A_VAL, SCALE_A_VAL - input_np_a_quantized, SCALE_A_VAL, ZERO_POINT_A_VAL = quantize_np(input_np_a, dtype) - return input_np_a_quantized - return None - - @tvm.testing.fixture - def quantize_input_np_b(self, input_np_b, dtype): - if dtype in ["uint8", "int8"]: - global ZERO_POINT_B_VAL, SCALE_B_VAL - input_np_b_quantized, SCALE_B_VAL, ZERO_POINT_B_VAL = quantize_np(input_np_b, dtype) - return input_np_b_quantized - return None - - @tvm.testing.fixture - def transformed_input_np_a(self, input_np_a, quantize_input_np_a, input_a_layout, dtype): - if dtype == "float16": - return transform_numpy(input_np_a, "nhwc", input_a_layout) - if dtype in ["uint8", "int8"]: - return transform_numpy(quantize_input_np_a, "nhwc", input_a_layout) - - raise RuntimeError(f"Unsupported data type '{dtype}'") - - @tvm.testing.fixture - def transformed_input_np_b(self, input_np_b, quantize_input_np_b, input_b_layout, dtype): - if dtype == "float16": - return transform_numpy(input_np_b, "nhwc", input_b_layout) - if dtype in ["uint8", "int8"]: - return transform_numpy(quantize_input_np_b, "nhwc", input_b_layout) - - raise RuntimeError(f"Unsupported data type '{dtype}'") - - @tvm.testing.requires_hexagon - def test_transform( - self, - dtype, - input_shape_a, - input_shape_b, - input_np_a, - input_np_b, - quantize_input_np_a, - quantize_input_np_b, - transformed_input_np_a, - transformed_input_np_b, - expected_output_np, - transformed_expected_output_np, - hexagon_session, - output_layout, - input_a_layout, - input_b_layout, - op_name, - ): - """Test transform.""" - output_shape = expected_output_np.shape - a_tensor = 
te.placeholder(input_shape_a, name="a_tensor", dtype=dtype) - b_tensor = te.placeholder(input_shape_b, name="b_tensor", dtype=dtype) - if dtype == "float16": - if op_name == "add": - m_tensor = sl.add_broadcast_compute(a_tensor, b_tensor) - elif op_name == "subtract": - m_tensor = sl.subtract_broadcast_compute(a_tensor, b_tensor) - elif op_name == "multiply": - m_tensor = sl.multiply_broadcast_compute(a_tensor, b_tensor) - tir_schedule = sl.tir_broadcast_schedule( - m_tensor, a_tensor, b_tensor, output_layout, input_a_layout, input_b_layout, op_name - ) - elif dtype in ["uint8", "int8"]: - args = [ - a_tensor, - b_tensor, - output_shape, - ZERO_POINT_A_VAL, - SCALE_A_VAL, - ZERO_POINT_B_VAL, - SCALE_B_VAL, - ZERO_POINT_M_VAL, - SCALE_M_VAL, - dtype, - ] - if op_name == "add": - m_tensor = qn.qadd_broadcast_compute(*args) - elif op_name == "subtract": - m_tensor = qn.qsubtract_broadcast_compute(*args) - elif op_name == "multiply": - m_tensor = qn.qmultiply_broadcast_compute(*args) - tir_schedule = qn.tir_schedule_quant( - m_tensor, a_tensor, b_tensor, output_layout, input_a_layout, input_b_layout - ) - - sch = tir_schedule.mod - - input_axis_separator = [4] - if output_layout in ( - "nhwc-8h2w32c2w-2d", - "nhwc-8h8w32c-2d", - ): - output_axis_separator = [4] - else: - raise RuntimeError(f"Unexpected layout '{output_layout}'") - - with tvm.transform.PassContext(opt_level=3): - func = tvm.build( - sch, - [a_tensor, b_tensor, m_tensor], - get_hexagon_target("v69"), - name="slice_op_with_transform", - ) - - if dtype == "float16": - in_data_np_a = input_np_a - in_data_np_b = input_np_b - elif dtype in ["int8", "uint8"]: - in_data_np_a = quantize_input_np_a - in_data_np_b = quantize_input_np_b - else: - raise RuntimeError(f"Unsupport dtype '{dtype}'") - - a_data_nd = hexagon_wrapper_allocation( - hexagon_session.device, - layout=input_a_layout, - data_original=in_data_np_a, - transformed_data=transformed_input_np_a, - axis_separators=input_axis_separator, - ) - b_data_nd = hexagon_wrapper_allocation( - hexagon_session.device, - layout=input_b_layout, - data_original=in_data_np_b, - transformed_data=transformed_input_np_b, - axis_separators=input_axis_separator, - ) - m_data_nd = hexagon_wrapper_allocation( - hexagon_session.device, - layout=output_layout, - tensor_shape=transformed_expected_output_np.shape, - axis_separators=output_axis_separator, - dtype=dtype, - ) - - mod = hexagon_session.load_module(func) - mod(a_data_nd, b_data_nd, m_data_nd) - - batch, height, width, channel = output_shape - # convert nd to np and reshape to fixed chunk size layout - if output_layout == "nhwc-8h2w32c2w-2d": - m_data_np = m_data_nd.numpy().reshape( - [batch, height // 8, width // 4, channel // 32, 8, 2, 32, 2] - ) - elif output_layout == "nhwc-8h8w32c-2d": - m_data_np = m_data_nd.numpy().reshape( - [batch, height // 8, width // 8, channel // 32, 8, 8, 32] - ) - - if dtype == "float16": - np.testing.assert_allclose( - transformed_expected_output_np, m_data_np, rtol=1e-3, atol=1e-3 - ) - elif dtype in ["int8", "uint8"]: - np.testing.assert_allclose(transformed_expected_output_np, m_data_np, rtol=1, atol=1) - - -if __name__ == "__main__": - tvm.testing.main() +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test code for Add, Subtract and Multiply.""" +import numpy as np + +import tvm +from tvm import te +import tvm.topi.hexagon.slice_ops as sl +import tvm.topi.hexagon.qnn as qn +from tvm.contrib.hexagon import allocate_hexagon_array +from ..infrastructure import ( + transform_numpy, + quantize_np, + get_hexagon_target, +) + +ZERO_POINT_A_VAL = None +SCALE_A_VAL = None + +ZERO_POINT_B_VAL = None +SCALE_B_VAL = None + +ZERO_POINT_M_VAL = None +SCALE_M_VAL = None + + +def hexagon_wrapper_allocation( + device, + layout, + axis_separators, + tensor_shape=None, + data_original=None, + transformed_data=None, + dtype=None, +): + """Input layout can either be nhwc-8h2w32c2w-2d or nhwc""" + if layout in ["nhwc-8h2w32c2w-2d", "nhwc-8h8w32c-2d"]: + data_nd = allocate_hexagon_array( + device, + tensor_shape=tensor_shape, + data=transformed_data, + dtype=dtype, + axis_separators=axis_separators, + mem_scope="global.vtcm", + ) + elif layout == "nhwc": + data_nd = allocate_hexagon_array( + device, + data=data_original, + ) + return data_nd + + +class TestAddSubtractMultiplyBroadcast2d: + """Test Add, Subtract and Multiply class.""" + + ( + input_shape_a, + input_shape_b, + input_a_layout, + input_b_layout, + output_layout, + dtype, + ) = tvm.testing.parameters( + # no broadcast needed - short input + ( + [1, 8, 4, 32], + [1, 8, 4, 32], + "nhwc-8h2w32c2w-2d", + "nhwc-8h2w32c2w-2d", + "nhwc-8h2w32c2w-2d", + "float16", + ), + # no broadcast needed - large input + ( + [1, 56, 64, 128], + [1, 56, 64, 128], + "nhwc-8h2w32c2w-2d", + "nhwc-8h2w32c2w-2d", + "nhwc-8h2w32c2w-2d", + "float16", + ), + # one input needs broadcast + ( + [1, 56, 64, 128], + [1, 1, 64, 1], + "nhwc-8h2w32c2w-2d", + "nhwc", + "nhwc-8h2w32c2w-2d", + "float16", + ), + # Both input needs broadcast + ( + [1, 56, 1, 128], + [1, 1, 64, 1], + "nhwc", + "nhwc", + "nhwc-8h2w32c2w-2d", + "float16", + ), + # One axis in one input needs broadcast + ( + [1, 56, 20, 128], + [1, 56, 20, 1], + "nhwc-8h2w32c2w-2d", + "nhwc", + "nhwc-8h2w32c2w-2d", + "float16", + ), + # broadcast all axes in one input + ( + [1, 48, 56, 32], + [1, 1, 1, 1], + "nhwc-8h2w32c2w-2d", + "nhwc", + "nhwc-8h2w32c2w-2d", + "float16", + ), + ( + [1, 48, 32, 64], + [1, 48, 32, 64], + "nhwc-8h8w32c-2d", + "nhwc-8h8w32c-2d", + "nhwc-8h8w32c-2d", + "uint8", + ), + # broadcast axis 2 in one input + ( + [1, 48, 32, 64], + [1, 48, 1, 64], + "nhwc-8h8w32c-2d", + "nhwc", + "nhwc-8h8w32c-2d", + "uint8", + ), + # broadcast axis 1 in one input + ( + [1, 48, 32, 64], + [1, 1, 32, 64], + "nhwc-8h8w32c-2d", + "nhwc", + "nhwc-8h8w32c-2d", + "uint8", + ), + # broadcast axis 3 in one input + ( + [1, 8, 8, 32], + [1, 8, 8, 1], + "nhwc-8h8w32c-2d", + "nhwc", + "nhwc-8h8w32c-2d", + "uint8", + ), + # broadcast both inputs + ( + [1, 56, 1, 128], + [1, 1, 64, 1], + "nhwc", + "nhwc", + "nhwc-8h8w32c-2d", + "uint8", + ), + # broadcast both inputs + ( + [1, 48, 1, 1], + [1, 1, 32, 32], + "nhwc", + "nhwc", + "nhwc-8h8w32c-2d", + "uint8", + ), 
+ # broadcast both inputs + ( + [1, 48, 1, 32], + [1, 1, 32, 1], + "nhwc", + "nhwc", + "nhwc-8h8w32c-2d", + "uint8", + ), + # broadcast all axes in one input + ( + [1, 48, 56, 32], + [1, 1, 1, 1], + "nhwc-8h8w32c-2d", + "nhwc", + "nhwc-8h8w32c-2d", + "uint8", + ), + ) + + op_name = tvm.testing.parameter("add", "subtract", "multiply") + + @tvm.testing.fixture + def expected_output_np(self, input_np_a, input_np_b, op_name): + """Generate expected output.""" + if op_name == "add": + out_ref = np.add(input_np_a, input_np_b) + elif op_name == "subtract": + out_ref = np.subtract(input_np_a, input_np_b) + elif op_name == "multiply": + out_ref = np.multiply(input_np_a, input_np_b) + return out_ref + + @tvm.testing.fixture + def transformed_expected_output_np(self, expected_output_np, output_layout, dtype): + """Generate expected output.""" + if dtype == "float16": + return transform_numpy(expected_output_np, "nhwc", output_layout) + if dtype in ["uint8", "int8"]: + global ZERO_POINT_M_VAL, SCALE_M_VAL + out_ref_quantized, SCALE_M_VAL, ZERO_POINT_M_VAL = quantize_np( + expected_output_np, dtype + ) + return transform_numpy(out_ref_quantized, "nhwc", output_layout) + + raise RuntimeError(f"Unsupported data type '{dtype}'") + + @tvm.testing.fixture + def input_np_a(self, input_shape_a, dtype): + """Generate numpy input for variable a.""" + if dtype in ["uint8", "int8"]: + dtype = "float32" + return np.random.random(input_shape_a).astype(dtype) + + @tvm.testing.fixture + def input_np_b(self, input_shape_b, dtype): + """Generate numpy input for variable b.""" + if dtype in ["uint8", "int8"]: + dtype = "float32" + return np.random.random(input_shape_b).astype(dtype) + + @tvm.testing.fixture + def quantize_input_np_a(self, input_np_a, dtype): + if dtype in ["uint8", "int8"]: + global ZERO_POINT_A_VAL, SCALE_A_VAL + input_np_a_quantized, SCALE_A_VAL, ZERO_POINT_A_VAL = quantize_np(input_np_a, dtype) + return input_np_a_quantized + return None + + @tvm.testing.fixture + def quantize_input_np_b(self, input_np_b, dtype): + if dtype in ["uint8", "int8"]: + global ZERO_POINT_B_VAL, SCALE_B_VAL + input_np_b_quantized, SCALE_B_VAL, ZERO_POINT_B_VAL = quantize_np(input_np_b, dtype) + return input_np_b_quantized + return None + + @tvm.testing.fixture + def transformed_input_np_a(self, input_np_a, quantize_input_np_a, input_a_layout, dtype): + if dtype == "float16": + return transform_numpy(input_np_a, "nhwc", input_a_layout) + if dtype in ["uint8", "int8"]: + return transform_numpy(quantize_input_np_a, "nhwc", input_a_layout) + + raise RuntimeError(f"Unsupported data type '{dtype}'") + + @tvm.testing.fixture + def transformed_input_np_b(self, input_np_b, quantize_input_np_b, input_b_layout, dtype): + if dtype == "float16": + return transform_numpy(input_np_b, "nhwc", input_b_layout) + if dtype in ["uint8", "int8"]: + return transform_numpy(quantize_input_np_b, "nhwc", input_b_layout) + + raise RuntimeError(f"Unsupported data type '{dtype}'") + + @tvm.testing.requires_hexagon + def test_transform( + self, + dtype, + input_shape_a, + input_shape_b, + input_np_a, + input_np_b, + quantize_input_np_a, + quantize_input_np_b, + transformed_input_np_a, + transformed_input_np_b, + expected_output_np, + transformed_expected_output_np, + hexagon_session, + output_layout, + input_a_layout, + input_b_layout, + op_name, + ): + """Test transform.""" + output_shape = expected_output_np.shape + a_tensor = te.placeholder(input_shape_a, name="a_tensor", dtype=dtype) + b_tensor = te.placeholder(input_shape_b, name="b_tensor", 
dtype=dtype) + if dtype == "float16": + if op_name == "add": + m_tensor = sl.add_broadcast_compute(a_tensor, b_tensor) + elif op_name == "subtract": + m_tensor = sl.subtract_broadcast_compute(a_tensor, b_tensor) + elif op_name == "multiply": + m_tensor = sl.multiply_broadcast_compute(a_tensor, b_tensor) + tir_schedule = sl.tir_broadcast_schedule( + m_tensor, a_tensor, b_tensor, output_layout, input_a_layout, input_b_layout, op_name + ) + elif dtype in ["uint8", "int8"]: + args = [ + a_tensor, + b_tensor, + output_shape, + ZERO_POINT_A_VAL, + SCALE_A_VAL, + ZERO_POINT_B_VAL, + SCALE_B_VAL, + ZERO_POINT_M_VAL, + SCALE_M_VAL, + dtype, + ] + if op_name == "add": + m_tensor = qn.qadd_broadcast_compute(*args) + elif op_name == "subtract": + m_tensor = qn.qsubtract_broadcast_compute(*args) + elif op_name == "multiply": + m_tensor = qn.qmultiply_broadcast_compute(*args) + tir_schedule = qn.tir_schedule_quant( + m_tensor, a_tensor, b_tensor, output_layout, input_a_layout, input_b_layout + ) + + sch = tir_schedule.mod + + input_axis_separator = [4] + if output_layout in ( + "nhwc-8h2w32c2w-2d", + "nhwc-8h8w32c-2d", + ): + output_axis_separator = [4] + else: + raise RuntimeError(f"Unexpected layout '{output_layout}'") + + with tvm.transform.PassContext(opt_level=3): + func = tvm.build( + sch, + [a_tensor, b_tensor, m_tensor], + get_hexagon_target("v69"), + name="slice_op_with_transform", + ) + + if dtype == "float16": + in_data_np_a = input_np_a + in_data_np_b = input_np_b + elif dtype in ["int8", "uint8"]: + in_data_np_a = quantize_input_np_a + in_data_np_b = quantize_input_np_b + else: + raise RuntimeError(f"Unsupport dtype '{dtype}'") + + a_data_nd = hexagon_wrapper_allocation( + hexagon_session.device, + layout=input_a_layout, + data_original=in_data_np_a, + transformed_data=transformed_input_np_a, + axis_separators=input_axis_separator, + ) + b_data_nd = hexagon_wrapper_allocation( + hexagon_session.device, + layout=input_b_layout, + data_original=in_data_np_b, + transformed_data=transformed_input_np_b, + axis_separators=input_axis_separator, + ) + m_data_nd = hexagon_wrapper_allocation( + hexagon_session.device, + layout=output_layout, + tensor_shape=transformed_expected_output_np.shape, + axis_separators=output_axis_separator, + dtype=dtype, + ) + + mod = hexagon_session.load_module(func) + mod(a_data_nd, b_data_nd, m_data_nd) + + batch, height, width, channel = output_shape + # convert nd to np and reshape to fixed chunk size layout + if output_layout == "nhwc-8h2w32c2w-2d": + m_data_np = m_data_nd.numpy().reshape( + [batch, height // 8, width // 4, channel // 32, 8, 2, 32, 2] + ) + elif output_layout == "nhwc-8h8w32c-2d": + m_data_np = m_data_nd.numpy().reshape( + [batch, height // 8, width // 8, channel // 32, 8, 8, 32] + ) + + if dtype == "float16": + np.testing.assert_allclose( + transformed_expected_output_np, m_data_np, rtol=1e-3, atol=1e-3 + ) + elif dtype in ["int8", "uint8"]: + np.testing.assert_allclose(transformed_expected_output_np, m_data_np, rtol=1, atol=1) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/topi/test_quantize.py b/tests/python/contrib/test_hexagon/topi/test_quantize.py index ac4f4d4e3047..2c54b12ab98e 100644 --- a/tests/python/contrib/test_hexagon/topi/test_quantize.py +++ b/tests/python/contrib/test_hexagon/topi/test_quantize.py @@ -1,128 +1,128 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""TIR quantize schedule tests.""" -import numpy as np - -import tvm -from tvm import te -import tvm.topi.hexagon.qnn as s1 -from tvm.contrib.hexagon import allocate_hexagon_array -from ..infrastructure import ( - transform_numpy, - quantize_np, - get_hexagon_target, -) - -QUANTIZE_SCALE = None -QUANTIZE_ZERO_POINT = None - - -class TestQuantize: - """Test quantize class.""" - - @tvm.testing.fixture - def expected_output_np(self, input_np, output_dtype): - global QUANTIZE_SCALE, QUANTIZE_ZERO_POINT - quant_np, QUANTIZE_SCALE, QUANTIZE_ZERO_POINT = quantize_np(input_np, output_dtype) - return quant_np - - @tvm.testing.fixture - def input_np(self, input_shape, input_dtype): - return np.random.random(input_shape).astype(input_dtype) - - @tvm.testing.fixture - def transformed_input_np(self, input_np, input_crouton_layout): - return transform_numpy(input_np, "nhwc", input_crouton_layout) - - @tvm.testing.fixture - def transformed_expected_output_np(self, expected_output_np, output_layout): - return transform_numpy(expected_output_np, "nhwc", output_layout) - - input_crouton_layout, output_layout, input_dtype = tvm.testing.parameters( - ("nhwc-4h2w32c2w-2d", "nhwc-8h8w32c-2d", "float32"), - ) - - output_dtype = tvm.testing.parameter("uint8", "int8") - - input_shape = tvm.testing.parameter( - (1, 8, 8, 32), (1, 16, 16, 32), (1, 16, 16, 128), (1, 64, 64, 64) - ) - - @tvm.testing.requires_hexagon - def test_quantize( - self, - input_dtype, - output_dtype, - transformed_input_np, - input_shape, - expected_output_np, - transformed_expected_output_np, - input_crouton_layout, - output_layout, - hexagon_session, - ): - """Test quantize.""" - a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=input_dtype) - - m_tensor = s1.quantize_compute(a_tensor, QUANTIZE_SCALE, QUANTIZE_ZERO_POINT, output_dtype) - - tir_schedule = s1.tir_quantize_schedule( - m_tensor, a_tensor, input_crouton_layout, output_layout - ) - - sch = tir_schedule.mod - - input_axis_separator = [4] - output_axis_separator = [4] - - with tvm.transform.PassContext(opt_level=3): - func = tvm.build( - sch, - [a_tensor, m_tensor], - get_hexagon_target("v69"), - name="quantize", - ) - - a_data_nd = allocate_hexagon_array( - hexagon_session.device, - data=transformed_input_np, - dtype=input_dtype, - axis_separators=input_axis_separator, - mem_scope="global.vtcm", - ) - - m_data_nd = allocate_hexagon_array( - hexagon_session.device, - tensor_shape=transformed_expected_output_np.shape, - dtype=output_dtype, - axis_separators=output_axis_separator, - mem_scope="global.vtcm", - ) - - mod = hexagon_session.load_module(func) - mod(a_data_nd, m_data_nd) - - b, h, weight, c = expected_output_np.shape - - # convert nd to np and reshape to fixed chunk size layout - m_data_np = m_data_nd.numpy().reshape([b, h // 8, weight // 8, c // 32, 8, 8, 32]) - - 
np.testing.assert_allclose(transformed_expected_output_np, m_data_np, atol=1) - - -if __name__ == "__main__": - tvm.testing.main() +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""TIR quantize schedule tests.""" +import numpy as np + +import tvm +from tvm import te +import tvm.topi.hexagon.qnn as s1 +from tvm.contrib.hexagon import allocate_hexagon_array +from ..infrastructure import ( + transform_numpy, + quantize_np, + get_hexagon_target, +) + +QUANTIZE_SCALE = None +QUANTIZE_ZERO_POINT = None + + +class TestQuantize: + """Test quantize class.""" + + @tvm.testing.fixture + def expected_output_np(self, input_np, output_dtype): + global QUANTIZE_SCALE, QUANTIZE_ZERO_POINT + quant_np, QUANTIZE_SCALE, QUANTIZE_ZERO_POINT = quantize_np(input_np, output_dtype) + return quant_np + + @tvm.testing.fixture + def input_np(self, input_shape, input_dtype): + return np.random.random(input_shape).astype(input_dtype) + + @tvm.testing.fixture + def transformed_input_np(self, input_np, input_crouton_layout): + return transform_numpy(input_np, "nhwc", input_crouton_layout) + + @tvm.testing.fixture + def transformed_expected_output_np(self, expected_output_np, output_layout): + return transform_numpy(expected_output_np, "nhwc", output_layout) + + input_crouton_layout, output_layout, input_dtype = tvm.testing.parameters( + ("nhwc-4h2w32c2w-2d", "nhwc-8h8w32c-2d", "float32"), + ) + + output_dtype = tvm.testing.parameter("uint8", "int8") + + input_shape = tvm.testing.parameter( + (1, 8, 8, 32), (1, 16, 16, 32), (1, 16, 16, 128), (1, 64, 64, 64) + ) + + @tvm.testing.requires_hexagon + def test_quantize( + self, + input_dtype, + output_dtype, + transformed_input_np, + input_shape, + expected_output_np, + transformed_expected_output_np, + input_crouton_layout, + output_layout, + hexagon_session, + ): + """Test quantize.""" + a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=input_dtype) + + m_tensor = s1.quantize_compute(a_tensor, QUANTIZE_SCALE, QUANTIZE_ZERO_POINT, output_dtype) + + tir_schedule = s1.tir_quantize_schedule( + m_tensor, a_tensor, input_crouton_layout, output_layout + ) + + sch = tir_schedule.mod + + input_axis_separator = [4] + output_axis_separator = [4] + + with tvm.transform.PassContext(opt_level=3): + func = tvm.build( + sch, + [a_tensor, m_tensor], + get_hexagon_target("v69"), + name="quantize", + ) + + a_data_nd = allocate_hexagon_array( + hexagon_session.device, + data=transformed_input_np, + dtype=input_dtype, + axis_separators=input_axis_separator, + mem_scope="global.vtcm", + ) + + m_data_nd = allocate_hexagon_array( + hexagon_session.device, + tensor_shape=transformed_expected_output_np.shape, + dtype=output_dtype, + axis_separators=output_axis_separator, + mem_scope="global.vtcm", + ) + + mod = hexagon_session.load_module(func) + 
mod(a_data_nd, m_data_nd) + + b, h, weight, c = expected_output_np.shape + + # convert nd to np and reshape to fixed chunk size layout + m_data_np = m_data_nd.numpy().reshape([b, h // 8, weight // 8, c // 32, 8, 8, 32]) + + np.testing.assert_allclose(transformed_expected_output_np, m_data_np, atol=1) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/topi/test_resize2d.py b/tests/python/contrib/test_hexagon/topi/test_resize2d.py index c0c6e7ca0fb4..7d7d80fc7f7c 100644 --- a/tests/python/contrib/test_hexagon/topi/test_resize2d.py +++ b/tests/python/contrib/test_hexagon/topi/test_resize2d.py @@ -1,202 +1,202 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Resize 2D tesst. -""" -import numpy as np - -import tvm -from tvm import te -from tvm.topi.testing import resize2d_python -import tvm.topi.hexagon as s1 -from tvm.contrib.hexagon import allocate_hexagon_array - -from ..infrastructure import transform_numpy, get_hexagon_target - - -class TestResize2d: - """Test resize 2D class.""" - - (batch, channel, in_height, in_width, out_height, out_width,) = tvm.testing.parameters( - ( - 1, - 32, - 8, - 8, - 16, - 16, - ), - ( - 1, - 32, - 48, - 48, - 8, - 8, - ), - ) - - (layout, input_crouton_layout, output_layout, dtype,) = tvm.testing.parameters( - ("NHWC", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", "float16"), - ("NHWC", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d", "uint8"), - ) - - coord_trans = tvm.testing.parameter("asymmetric", "align_corners", "half_pixel") - method = tvm.testing.parameter("nearest_neighbor", "linear") - - @tvm.testing.fixture - def expected_output_np( - self, - input_np, - in_height, - in_width, - out_height, - out_width, - layout, - method, - coord_trans, - ): - """Generate expected output.""" - scale_h = out_height / in_height - scale_w = out_width / in_width - - return resize2d_python(input_np, (scale_h, scale_w), layout, method, coord_trans) - - @tvm.testing.fixture - def input_np(self, input_shape, dtype): - if dtype == "float16": - return np.random.random(input_shape).astype(dtype) - if dtype == "uint8": - return np.random.randint(0, 255, input_shape).astype(dtype) - if dtype == "int8": - return np.random.randint(-128, 127, input_shape).astype(dtype) - raise RuntimeError(f"dtype {dtype} is not valid.") - - @tvm.testing.fixture - def transformed_input_np(self, input_np, layout, input_crouton_layout, dtype): - if dtype in ["float16", "uint8", "int8"]: - return transform_numpy(input_np, layout.lower(), input_crouton_layout) - - raise RuntimeError(f"Unsupported data type '{dtype}'") - - @tvm.testing.fixture - def transformed_expected_output_np(self, expected_output_np, layout, output_layout, dtype): - if dtype in ["float16", "uint8", "int8"]: - return transform_numpy(expected_output_np, 
layout.lower(), output_layout) - - raise RuntimeError(f"Unsupported data type '{dtype}'") - - @tvm.testing.fixture - def input_shape(self, batch, channel, in_height, in_width): - return (batch, in_height, in_width, channel) - - @tvm.testing.fixture - def output_shape(self, batch, channel, out_height, out_width): - return (batch, out_height, out_width, channel) - - @tvm.testing.requires_hexagon - def test_resize2d( - self, - dtype, - input_np, - transformed_input_np, - input_shape, - output_shape, - expected_output_np, - transformed_expected_output_np, - layout, - input_crouton_layout, - output_layout, - coord_trans, - method, - hexagon_session, - ): - """Test resize 2D.""" - a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=dtype) - - m_tensor = s1.resize2d_compute( - a_tensor, - [0.0] * 4, - (output_shape[1], output_shape[2]), - layout=layout, - coordinate_transformation_mode=coord_trans, - method=method, - out_dtype=dtype, - ) - - tir_schedule = s1.tir_resize2d_schedule( - m_tensor, a_tensor, input_crouton_layout, output_layout - ) - - sch = tir_schedule.mod - - input_axis_separator = [4] - if output_layout in ( - "nhwc-8h2w32c2w-2d", - "nhwc-8h8w32c-2d", - ): - output_axis_separator = [4] - else: - raise RuntimeError(f"Unexpected layout '{output_layout}'") - - with tvm.transform.PassContext(opt_level=3): - func = tvm.build( - sch, - [a_tensor, m_tensor], - get_hexagon_target("v69"), - name="resize2d", - ) - - a_data_nd = allocate_hexagon_array( - hexagon_session.device, - data=transformed_input_np, - dtype=dtype, - axis_separators=input_axis_separator, - mem_scope="global.vtcm", - ) - - m_data_nd = allocate_hexagon_array( - hexagon_session.device, - transformed_expected_output_np.shape, - dtype=dtype, - axis_separators=output_axis_separator, - mem_scope="global.vtcm", - ) - - mod = hexagon_session.load_module(func) - mod(a_data_nd, m_data_nd) - - batch_size, height, width, channel = output_shape - # convert nd to np and reshape to fixed chunk size layout - if output_layout == "nhwc-8h2w32c2w-2d": - m_data_np = m_data_nd.numpy().reshape( - [batch_size, height // 8, width // 4, channel // 32, 8, 2, 32, 2] - ) - elif output_layout == "nhwc-8h8w32c-2d": - m_data_np = m_data_nd.numpy().reshape( - [batch_size, height // 8, width // 8, channel // 32, 8, 8, 32] - ) - - if dtype == "float16": - np.testing.assert_allclose( - transformed_expected_output_np, m_data_np, rtol=1e-3, atol=1e-3 - ) - elif dtype in ["int8", "uint8"]: - np.testing.assert_allclose(transformed_expected_output_np, m_data_np, rtol=1, atol=1) - - -if __name__ == "__main__": - tvm.testing.main() +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Resize 2D tesst. 
+""" +import numpy as np + +import tvm +from tvm import te +from tvm.topi.testing import resize2d_python +import tvm.topi.hexagon as s1 +from tvm.contrib.hexagon import allocate_hexagon_array + +from ..infrastructure import transform_numpy, get_hexagon_target + + +class TestResize2d: + """Test resize 2D class.""" + + (batch, channel, in_height, in_width, out_height, out_width,) = tvm.testing.parameters( + ( + 1, + 32, + 8, + 8, + 16, + 16, + ), + ( + 1, + 32, + 48, + 48, + 8, + 8, + ), + ) + + (layout, input_crouton_layout, output_layout, dtype,) = tvm.testing.parameters( + ("NHWC", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", "float16"), + ("NHWC", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d", "uint8"), + ) + + coord_trans = tvm.testing.parameter("asymmetric", "align_corners", "half_pixel") + method = tvm.testing.parameter("nearest_neighbor", "linear") + + @tvm.testing.fixture + def expected_output_np( + self, + input_np, + in_height, + in_width, + out_height, + out_width, + layout, + method, + coord_trans, + ): + """Generate expected output.""" + scale_h = out_height / in_height + scale_w = out_width / in_width + + return resize2d_python(input_np, (scale_h, scale_w), layout, method, coord_trans) + + @tvm.testing.fixture + def input_np(self, input_shape, dtype): + if dtype == "float16": + return np.random.random(input_shape).astype(dtype) + if dtype == "uint8": + return np.random.randint(0, 255, input_shape).astype(dtype) + if dtype == "int8": + return np.random.randint(-128, 127, input_shape).astype(dtype) + raise RuntimeError(f"dtype {dtype} is not valid.") + + @tvm.testing.fixture + def transformed_input_np(self, input_np, layout, input_crouton_layout, dtype): + if dtype in ["float16", "uint8", "int8"]: + return transform_numpy(input_np, layout.lower(), input_crouton_layout) + + raise RuntimeError(f"Unsupported data type '{dtype}'") + + @tvm.testing.fixture + def transformed_expected_output_np(self, expected_output_np, layout, output_layout, dtype): + if dtype in ["float16", "uint8", "int8"]: + return transform_numpy(expected_output_np, layout.lower(), output_layout) + + raise RuntimeError(f"Unsupported data type '{dtype}'") + + @tvm.testing.fixture + def input_shape(self, batch, channel, in_height, in_width): + return (batch, in_height, in_width, channel) + + @tvm.testing.fixture + def output_shape(self, batch, channel, out_height, out_width): + return (batch, out_height, out_width, channel) + + @tvm.testing.requires_hexagon + def test_resize2d( + self, + dtype, + input_np, + transformed_input_np, + input_shape, + output_shape, + expected_output_np, + transformed_expected_output_np, + layout, + input_crouton_layout, + output_layout, + coord_trans, + method, + hexagon_session, + ): + """Test resize 2D.""" + a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=dtype) + + m_tensor = s1.resize2d_compute( + a_tensor, + [0.0] * 4, + (output_shape[1], output_shape[2]), + layout=layout, + coordinate_transformation_mode=coord_trans, + method=method, + out_dtype=dtype, + ) + + tir_schedule = s1.tir_resize2d_schedule( + m_tensor, a_tensor, input_crouton_layout, output_layout + ) + + sch = tir_schedule.mod + + input_axis_separator = [4] + if output_layout in ( + "nhwc-8h2w32c2w-2d", + "nhwc-8h8w32c-2d", + ): + output_axis_separator = [4] + else: + raise RuntimeError(f"Unexpected layout '{output_layout}'") + + with tvm.transform.PassContext(opt_level=3): + func = tvm.build( + sch, + [a_tensor, m_tensor], + get_hexagon_target("v69"), + name="resize2d", + ) + + a_data_nd = allocate_hexagon_array( + 
hexagon_session.device, + data=transformed_input_np, + dtype=dtype, + axis_separators=input_axis_separator, + mem_scope="global.vtcm", + ) + + m_data_nd = allocate_hexagon_array( + hexagon_session.device, + transformed_expected_output_np.shape, + dtype=dtype, + axis_separators=output_axis_separator, + mem_scope="global.vtcm", + ) + + mod = hexagon_session.load_module(func) + mod(a_data_nd, m_data_nd) + + batch_size, height, width, channel = output_shape + # convert nd to np and reshape to fixed chunk size layout + if output_layout == "nhwc-8h2w32c2w-2d": + m_data_np = m_data_nd.numpy().reshape( + [batch_size, height // 8, width // 4, channel // 32, 8, 2, 32, 2] + ) + elif output_layout == "nhwc-8h8w32c-2d": + m_data_np = m_data_nd.numpy().reshape( + [batch_size, height // 8, width // 8, channel // 32, 8, 8, 32] + ) + + if dtype == "float16": + np.testing.assert_allclose( + transformed_expected_output_np, m_data_np, rtol=1e-3, atol=1e-3 + ) + elif dtype in ["int8", "uint8"]: + np.testing.assert_allclose(transformed_expected_output_np, m_data_np, rtol=1, atol=1) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py old mode 100755 new mode 100644 diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py old mode 100755 new mode 100644 diff --git a/tests/python/unittest/test_transform_layout.py b/tests/python/unittest/test_transform_layout.py old mode 100755 new mode 100644