From 6a7afebe3fbfacbfccbf4d6ba5f656870ccbbad1 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Sun, 28 Aug 2022 21:52:31 -0700 Subject: [PATCH 1/3] [Hexagon] Tests pylint --- tests/lint/pylint.sh | 17 +- .../contrib/test_hexagon/pytest_util.py | 36 +- .../test_hexagon/test_async_dma_pipeline.py | 510 ++++++++++-------- .../test_hexagon/test_benchmark_maxpool2d.py | 190 +++---- .../test_fixed_point_conversion.py | 13 +- .../test_hexagon/test_fixed_point_multiply.py | 167 +++--- .../contrib/test_hexagon/test_memory_alloc.py | 22 +- .../test_hexagon/test_meta_schedule.py | 138 +++-- .../contrib/test_hexagon/test_parallel_hvx.py | 81 +-- .../test_parallel_hvx_load_vtcm.py | 224 ++++---- .../test_hexagon/test_parallel_scalar.py | 63 ++- .../contrib/test_hexagon/test_sigmoid.py | 40 +- .../test_software_pipeline_async.py | 269 ++++----- .../test_hexagon/test_vtcm_bandwidth.py | 53 +- .../test_wo_qnn_canonicalization.py | 5 +- 15 files changed, 1002 insertions(+), 826 deletions(-) diff --git a/tests/lint/pylint.sh b/tests/lint/pylint.sh index 6b5415987985..e41dc2bb80b8 100755 --- a/tests/lint/pylint.sh +++ b/tests/lint/pylint.sh @@ -27,20 +27,9 @@ python3 -m pylint tests/python/ci --rcfile="$(dirname "$0")"/pylintrc python3 -m pylint tests/python/integration/ --rcfile="$(dirname "$0")"/pylintrc # tests/python/contrib/test_hexagon tests -python3 -m pylint tests/python/contrib/test_hexagon/benchmark_util.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/conftest.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/infrastructure.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_2d_physical_buffers.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_autotvm.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_cache_read_write.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_launcher.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_models.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_run_unit_tests.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_thread_pool.py --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint tests/python/contrib/test_hexagon/test_usmp.py --rcfile="$(dirname "$0")"/pylintrc +python3 -m pylint tests/python/contrib/test_hexagon/*.py --rcfile="$(dirname "$0")"/pylintrc +python3 -m pylint tests/python/contrib/test_hexagon/conv2d/*.py --rcfile="$(dirname "$0")"/pylintrc + # tests/python/frontend tests python3 -m pylint tests/python/frontend/caffe/test_forward.py --rcfile="$(dirname "$0")"/pylintrc diff --git a/tests/python/contrib/test_hexagon/pytest_util.py b/tests/python/contrib/test_hexagon/pytest_util.py index 77842ce91493..c078edf7a934 100644 --- a/tests/python/contrib/test_hexagon/pytest_util.py +++ b/tests/python/contrib/test_hexagon/pytest_util.py @@ -15,11 +15,11 @@ # specific language governing permissions and 
limitations # under the License. -import pytest -import numpy as np -from typing import * +""" Hexagon pytest utility functions """ + +from typing import List, Optional, Union import collections -import tvm.testing +import numpy as np def get_test_id(*test_params, test_param_descs: List[Optional[str]] = None) -> str: @@ -47,35 +47,35 @@ def get_test_id(*test_params, test_param_descs: List[Optional[str]] = None) -> s assert len(test_param_descs) == len(test_params) def get_single_param_chunk(param_val, param_desc: Optional[str]): - if type(param_val) == list: + if isinstance(param_val, list): # Like str(list), but avoid the whitespace padding. val_str = "[" + ",".join(str(x) for x in param_val) + "]" need_prefix_separator = False - elif type(param_val) == bool: + elif isinstance(param_val, bool): if param_val: val_str = "T" else: val_str = "F" need_prefix_separator = True - elif type(param_val) == TensorContentConstant: + elif isinstance(param_val, TensorContentConstant): val_str = f"const[{param_val.elem_value}]" need_prefix_separator = True - elif type(param_val) == TensorContentDtypeMin: + elif isinstance(param_val, TensorContentDtypeMin): val_str = "min" need_prefix_separator = True - elif type(param_val) == TensorContentDtypeMax: + elif isinstance(param_val, TensorContentDtypeMax): val_str = "max" need_prefix_separator = True - elif type(param_val) == TensorContentRandom: + elif isinstance(param_val, TensorContentRandom): val_str = "random" need_prefix_separator = True - elif type(param_val) == TensorContentSequentialCOrder: + elif isinstance(param_val, TensorContentSequentialCOrder): val_str = f"seqC[start:{param_val.start_value},inc:{param_val.increment}]" need_prefix_separator = True @@ -148,26 +148,26 @@ def create_populated_numpy_ndarray( """ itp = input_tensor_populator # just for brevity - if type(itp) == TensorContentConstant: + if isinstance(itp, TensorContentConstant): return np.full(tuple(input_shape), itp.elem_value, dtype=dtype) - elif type(itp) == TensorContentDtypeMin: + elif isinstance(itp, TensorContentDtypeMin): info = get_numpy_dtype_info(dtype) return np.full(tuple(input_shape), info.min, dtype=dtype) - elif type(itp) == TensorContentDtypeMax: + elif isinstance(itp, TensorContentDtypeMax): info = get_numpy_dtype_info(dtype) return np.full(tuple(input_shape), info.max, dtype=dtype) - elif type(itp) == TensorContentRandom: + elif isinstance(itp, TensorContentRandom): return np.random.random(input_shape).astype(dtype) - elif type(itp) == TensorContentSequentialCOrder: + elif isinstance(itp, TensorContentSequentialCOrder): a = np.empty(tuple(input_shape), dtype) - with np.nditer(a, op_flags=["writeonly"], order="C") as it: + with np.nditer(a, op_flags=["writeonly"], order="C") as iterator: next_elem_val = itp.start_value - for elem in it: + for elem in iterator: elem[...] 
= next_elem_val next_elem_val += itp.increment return a diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index 45e8eb0f68c6..c5830387160b 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -29,30 +29,35 @@ def conv_approximation(size_a, size_w): + """Conv approximation.""" a_shape = (size_a, VRMPY_SIZE_B) w_shape = (size_w, VRMPY_SIZE_B) out_shape = (size_a, VRMPY_SIZE_INT32) @T.prim_func - def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, a_shape, dtype="uint8") - W = T.match_buffer(b, w_shape, dtype="uint8") - C = T.match_buffer(c, out_shape, dtype="int32") - for n, i in T.grid(size_a, size_w): - with T.block("C"): - vn, vi = T.axis.remap("SR", [n, i]) - T.reads(A[vn, 0:VRMPY_SIZE_B], W[vi, 0:VRMPY_SIZE_B], C[vn, 0:VRMPY_SIZE_INT32]) - T.writes(C[vn, 0:VRMPY_SIZE_INT32]) + a_buffer = T.match_buffer(a_input, a_shape, dtype="uint8") + w_buffer = T.match_buffer(b_input, w_shape, dtype="uint8") + c_buffer = T.match_buffer(c_output, out_shape, dtype="int32") + for n, index_0 in T.grid(size_a, size_w): + with T.block("c_buffer"): + vn_index, vi_index = T.axis.remap("SR", [n, index_0]) + T.reads( + a_buffer[vn_index, 0:VRMPY_SIZE_B], + w_buffer[vi_index, 0:VRMPY_SIZE_B], + c_buffer[vn_index, 0:VRMPY_SIZE_INT32], + ) + T.writes(c_buffer[vn_index, 0:VRMPY_SIZE_INT32]) with T.init(): for x in T.serial(VRMPY_SIZE_INT32): - C[vn, x] = 0 - C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin( + c_buffer[vn_index, x] = 0 + c_buffer[vn_index, T.ramp(0, 1, 32)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.acc.128B"), T.uint32(3), - C[vn, T.ramp(0, 1, 32)], - T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), - T.reinterpret(W[vi, T.ramp(0, 1, 128)], dtype="int32x32"), + c_buffer[vn_index, T.ramp(0, 1, 32)], + T.reinterpret(a_buffer[vn_index, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(w_buffer[vi_index, T.ramp(0, 1, 128)], dtype="int32x32"), dtype="int32x32", ) # Currently async DMA lowering does not add any wait to the end of schedules so @@ -73,13 +78,14 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: def evaluate( hexagon_session, sch, - a, - b, - c, + a_data, + b_data, + c_data, expected_output=None, use_async_copy=0, merge_async_commit_queue_scope=False, ): + """Evaluate function.""" target_hexagon = tvm.target.hexagon("v68", link_params=True) with tvm.transform.PassContext( config={ @@ -92,9 +98,9 @@ def evaluate( ) module = hexagon_session.load_module(func_tir) - a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) - b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device) - c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) + a_hexagon = tvm.runtime.ndarray.array(a_data, device=hexagon_session.device) + b_hexagon = tvm.runtime.ndarray.array(b_data, device=hexagon_session.device) + c_hexagon = tvm.runtime.ndarray.array(c_data, device=hexagon_session.device) if tvm.testing.utils.IS_IN_CI: # Run with reduced number and repeat for CI @@ -108,32 +114,8 @@ def evaluate( return round(time.mean * 1000, 4) -@tvm.testing.fixture -def input_a(size_a): - return default_rng().integers(0, 8, (size_a, VRMPY_SIZE_B), dtype="uint8") - - 
-@tvm.testing.fixture -def input_w(size_w): - return default_rng().integers(0, 8, (size_w, VRMPY_SIZE_B), dtype="uint8") - - -@tvm.testing.fixture -def expected_output(size_a, size_w, input_a, input_w): - if tvm.testing.utils.IS_IN_CI and (size_a > 1024 or size_w > 1): - pytest.skip("Skipping test since it takes too long in CI.") - expected_output = np.zeros((size_a, VRMPY_SIZE_INT32), dtype="int32") - for n in range(size_a): - for x in range(size_w): - for i in range(VRMPY_SIZE_INT32): - for r in range(4): - expected_output[n, i] += np.uint32(input_a[n, i * 4 + r]) * np.uint32( - input_w[x, i * 4 + r] - ) - return expected_output - - def get_single_dma_schedule(size_a, size_w): + """Generate single DMA schedule.""" a_shape = (size_a, VRMPY_SIZE_B) w_shape = (size_w, VRMPY_SIZE_B) out_shape = (size_a, VRMPY_SIZE_INT32) @@ -142,32 +124,32 @@ def get_single_dma_schedule(size_a, size_w): w_bytes = size_w * VRMPY_SIZE_B @T.prim_func - def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + def operator(a_input: T.handle, b_input: T.handle, c_output: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, a_shape, dtype="uint8", mem_scope="global") - W = T.match_buffer(b, w_shape, dtype="uint8", mem_scope="global") - C = T.match_buffer(c, out_shape, dtype="int32", mem_scope="global") - A_global_vtcm = T.alloc_buffer(a_shape, dtype="uint8", mem_scope="global.vtcm") - W_global_vtcm = T.alloc_buffer(w_shape, dtype="uint8", mem_scope="global.vtcm") - C_global_vtcm = T.alloc_buffer(out_shape, dtype="int32", mem_scope="global.vtcm") + a_buffer = T.match_buffer(a_input, a_shape, dtype="uint8", mem_scope="global") + w_buffer = T.match_buffer(b_input, w_shape, dtype="uint8", mem_scope="global") + c_buffer = T.match_buffer(c_output, out_shape, dtype="int32", mem_scope="global") + a_global_vtcm = T.alloc_buffer(a_shape, dtype="uint8", mem_scope="global.vtcm") + w_global_vtcm = T.alloc_buffer(w_shape, dtype="uint8", mem_scope="global.vtcm") + c_global_vtcm = T.alloc_buffer(out_shape, dtype="int32", mem_scope="global.vtcm") T.evaluate( T.tvm_call_packed( "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( - A_global_vtcm.data, + a_global_vtcm.data, T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), 0, 2, - A_global_vtcm.dtype, + a_global_vtcm.dtype, 0, dtype="handle", ), T.tvm_stack_make_array( - A.data, + a_buffer.data, T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), 0, 2, - A.dtype, + a_buffer.dtype, 0, dtype="handle", ), @@ -179,20 +161,20 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: T.tvm_call_packed( "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( - W_global_vtcm.data, + w_global_vtcm.data, T.tvm_stack_make_shape(size_w, VRMPY_SIZE_B, dtype="handle"), 0, 2, - W_global_vtcm.dtype, + w_global_vtcm.dtype, 0, dtype="handle", ), T.tvm_stack_make_array( - W.data, + w_buffer.data, T.tvm_stack_make_shape(size_w, VRMPY_SIZE_B, dtype="handle"), 0, 2, - W.dtype, + w_buffer.dtype, 0, dtype="handle", ), @@ -200,43 +182,43 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: dtype="int32", ) ) - for n, i in T.grid(size_a, size_w): - with T.block("C"): - vn, vi = T.axis.remap("SR", [n, i]) + for n, index_0 in T.grid(size_a, size_w): + with T.block("c_buffer"): + vn_index, vi_index = T.axis.remap("SR", [n, index_0]) T.reads( - A_global_vtcm[vn, 0:VRMPY_SIZE_B], - W_global_vtcm[vi, 0:VRMPY_SIZE_B], - C_global_vtcm[vn, 0:VRMPY_SIZE_INT32], + a_global_vtcm[vn_index, 0:VRMPY_SIZE_B], + 
w_global_vtcm[vi_index, 0:VRMPY_SIZE_B], + c_global_vtcm[vn_index, 0:VRMPY_SIZE_INT32], ) - T.writes(C_global_vtcm[vn, 0:VRMPY_SIZE_INT32]) + T.writes(c_global_vtcm[vn_index, 0:VRMPY_SIZE_INT32]) with T.init(): for x in T.serial(VRMPY_SIZE_INT32): - C_global_vtcm[vn, x] = 0 - C_global_vtcm[vn, T.ramp(0, 1, 32)] += T.call_llvm_intrin( + c_global_vtcm[vn_index, x] = 0 + c_global_vtcm[vn_index, T.ramp(0, 1, 32)] += T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), T.uint32(2), - T.reinterpret(A_global_vtcm[vn, T.ramp(0, 1, 128)], dtype="int32x32"), - T.reinterpret(W_global_vtcm[vi, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(a_global_vtcm[vn_index, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(w_global_vtcm[vi_index, T.ramp(0, 1, 128)], dtype="int32x32"), dtype="int32x32", ) T.evaluate( T.tvm_call_packed( "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( - C.data, + c_buffer.data, T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), 0, 2, - C.dtype, + c_buffer.dtype, 0, dtype="handle", ), T.tvm_stack_make_array( - C_global_vtcm.data, + c_global_vtcm.data, T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), 0, 2, - C_global_vtcm.dtype, + c_global_vtcm.dtype, 0, dtype="handle", ), @@ -251,43 +233,45 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: def get_fake_conv_vtcm_schedule(size_a, size_w, blocks=2): + """Generate fake conv schedule with VTCM.""" sch = conv_approximation(size_a, size_w) - compute_block = sch.get_block("C") + compute_block = sch.get_block("c_buffer") sch.cache_read(compute_block, 1, "global.vtcm") n = sch.get_loops(compute_block)[0] - no, _ = sch.split(n, [blocks, None]) + n_outer, _ = sch.split(n, [blocks, None]) cache_read_block_a = sch.cache_read(compute_block, 0, "global.vtcm") - sch.compute_at(cache_read_block_a, no) + sch.compute_at(cache_read_block_a, n_outer) sch.fuse(*sch.get_loops(cache_read_block_a)[1:]) cache_write_block_c = sch.cache_write(compute_block, 0, "global.vtcm") - sch.reverse_compute_at(cache_write_block_c, no) + sch.reverse_compute_at(cache_write_block_c, n_outer) sch.fuse(*sch.get_loops(cache_write_block_c)[1:]) return sch def get_multi_input_fake_conv_vtcm_schedule(size_a, size_w, blocks=2): + """Generate multi input fake Conv using VTCM.""" sch = conv_approximation(size_a, size_w) - compute_block = sch.get_block("C") + compute_block = sch.get_block("c_buffer") n = sch.get_loops(compute_block)[0] - no, _ = sch.split(n, [blocks, None]) + n_outer, _ = sch.split(n, [blocks, None]) cache_read_block_a = sch.cache_read(compute_block, 0, "global.vtcm") - sch.compute_at(cache_read_block_a, no) + sch.compute_at(cache_read_block_a, n_outer) sch.fuse(*sch.get_loops(cache_read_block_a)[1:]) cache_read_block_b = sch.cache_read(compute_block, 1, "global.vtcm") - sch.compute_at(cache_read_block_b, no) + sch.compute_at(cache_read_block_b, n_outer) sch.fuse(*sch.get_loops(cache_read_block_b)[1:]) cache_write_block_c = sch.cache_write(compute_block, 0, "global.vtcm") - sch.reverse_compute_at(cache_write_block_c, no) + sch.reverse_compute_at(cache_write_block_c, n_outer) sch.fuse(*sch.get_loops(cache_write_block_c)[1:]) return sch @@ -301,6 +285,8 @@ def print_results(test_key, runtimes): class TestAsyncDMAPipeline: + """Async DMA pipeline test class.""" + # Removed most of these to speedup CI. 
size_a = tvm.testing.parameter( 1024, @@ -314,6 +300,29 @@ class TestAsyncDMAPipeline: 9 * 9, ) + @tvm.testing.fixture + def input_a(self, size_a): + return default_rng().integers(0, 8, (size_a, VRMPY_SIZE_B), dtype="uint8") + + @tvm.testing.fixture + def input_w(self, size_w): + return default_rng().integers(0, 8, (size_w, VRMPY_SIZE_B), dtype="uint8") + + @tvm.testing.fixture + def expected_output(self, size_a, size_w, input_a, input_w): + """Generate expected output.""" + if tvm.testing.utils.IS_IN_CI and (size_a > 1024 or size_w > 1): + pytest.skip("Skipping test since it takes too long in CI.") + expected_result = np.zeros((size_a, VRMPY_SIZE_INT32), dtype="int32") + for n in range(size_a): + for x in range(size_w): + for index_0 in range(VRMPY_SIZE_INT32): + for r_index in range(4): + expected_result[n, index_0] += np.uint32( + input_a[n, index_0 * 4 + r_index] + ) * np.uint32(input_w[x, index_0 * 4 + r_index]) + return expected_result + @tvm.testing.requires_hexagon def test_loading_vtcm_for_vrmpy( self, @@ -324,6 +333,7 @@ def test_loading_vtcm_for_vrmpy( input_w, expected_output, ): + """VTCM for VRMPY test.""" if tvm.testing.utils.IS_IN_CI and (size_a > 1024 or size_w > 1): pytest.skip("Skipping test since it takes too long in CI.") @@ -350,7 +360,7 @@ def test_loading_vtcm_for_vrmpy( ) sch = get_fake_conv_vtcm_schedule(size_a, size_w) - n = sch.get_loops(sch.get_block("C"))[0] + n = sch.get_loops(sch.get_block("c_buffer"))[0] sch.annotate(n, "software_pipeline_stage", [0, 1, 2]) sch.annotate(n, "software_pipeline_order", [0, 1, 2]) sch.annotate(n, "software_pipeline_async_stages", [0]) @@ -365,11 +375,11 @@ def test_loading_vtcm_for_vrmpy( ) sch = get_fake_conv_vtcm_schedule(size_a, size_w) - n = sch.get_loops(sch.get_block("C"))[0] + n = sch.get_loops(sch.get_block("c_buffer"))[0] sch.annotate(n, "software_pipeline_stage", [0, 1, 2]) sch.annotate(n, "software_pipeline_order", [0, 1, 2]) sch.annotate(n, "software_pipeline_async_stages", [0, 2]) - async_input_output_runtime = evaluate( + async_input_output = evaluate( hexagon_session, sch, input_a, @@ -380,11 +390,11 @@ def test_loading_vtcm_for_vrmpy( ) sch = get_fake_conv_vtcm_schedule(size_a, size_w) - n = sch.get_loops(sch.get_block("C"))[0] + n = sch.get_loops(sch.get_block("c_buffer"))[0] sch.annotate(n, "software_pipeline_stage", [0, 3, 6]) sch.annotate(n, "software_pipeline_order", [0, 1, 2]) sch.annotate(n, "software_pipeline_async_stages", [0, 6]) - async_input_output_runtime_larger_buffers = evaluate( + async_larger_buffers = evaluate( hexagon_session, sch, input_a, @@ -395,11 +405,11 @@ def test_loading_vtcm_for_vrmpy( ) sch = get_multi_input_fake_conv_vtcm_schedule(size_a, size_w) - n = sch.get_loops(sch.get_block("C"))[0] + n = sch.get_loops(sch.get_block("c_buffer"))[0] sch.annotate(n, "software_pipeline_stage", [0, 0, 1, 2]) sch.annotate(n, "software_pipeline_order", [0, 1, 2, 3]) sch.annotate(n, "software_pipeline_async_stages", [0, 2]) - async_multi_input_output_runtime = evaluate( + async_multi_input_output = evaluate( hexagon_session, sch, input_a, @@ -411,7 +421,7 @@ def test_loading_vtcm_for_vrmpy( ) sch = get_fake_conv_vtcm_schedule(size_a, size_w) - n = sch.get_loops(sch.get_block("C"))[0] + n = sch.get_loops(sch.get_block("c_buffer"))[0] sch.annotate(n, "software_pipeline_stage", [0, 1, 2]) sch.annotate(n, "software_pipeline_order", [0, 1, 2]) sch.annotate(n, "software_pipeline_async_stages", [2]) @@ -435,22 +445,30 @@ def test_loading_vtcm_for_vrmpy( expected_output, ) - # Total transfer size is equal to 
the size of A + W + C which is equal to 2 * size_a * 128 + size_w * 128 + # Total transfer size is equal to the size of + # a_buffer + w_buffer + c_buffer which is equal to 2 * size_a * 128 + size_w * 128 transfer_mb = round((2 * size_a * VRMPY_SIZE_B + size_w * VRMPY_SIZE_B) / 1e6, 2) - # Total number of operations can be calculated given the total number of vrmpy calls (size_a * size_w) * operations per vrmpy accumulate (128 multiplies + 3 adds for reduction per lane + 1 add for accumulate per lane) + # Total number of operations can be calculated given + # the total number of vrmpy calls (size_a * size_w) * operations + # per vrmpy accumulate (128 multiplies + 3 adds for reduction + # per lane + 1 add for accumulate per lane) complexity = round(size_a * size_w * (VRMPY_SIZE_B * 4) / 1e9, 3) print_results( - f"Test with A.size: {size_a * VRMPY_SIZE_B}, W.size: {size_w * VRMPY_SIZE_B}, computational complexity of {complexity} GOPs, and total memory transfer of {transfer_mb} MB...", + ( + f"Test with a_buffer.size: {size_a * VRMPY_SIZE_B}, w_buffer.size:" + f" {size_w * VRMPY_SIZE_B}, computational complexity of {complexity} GOPs" + f", and total memory transfer of {transfer_mb} MB..." + ), { "without_vtcm": base_runtime, "synchronous_dma": single_dma_runtime, "base_vtcm": base_vtcm_runtime, "async_dma_input": async_input_runtime, "async_dma_output": async_output_runtime, - "async_dma_input_output": async_input_output_runtime, - "async_dma_multi_input_output": async_multi_input_output_runtime, - "async_input_output_runtime_larger_buffers": async_input_output_runtime_larger_buffers, + "async_dma_input_output": async_input_output, + "async_dma_multi_input_output": async_multi_input_output, + "async_input_output_runtime_larger_buffers": async_larger_buffers, }, ) @@ -458,84 +476,102 @@ def test_loading_vtcm_for_vrmpy( # from tvm.script import tir as T @tvm.script.ir_module class ModulePipelined: + """Pipelined module class.""" + + # pylint: disable=no-self-argument @T.prim_func def main( - p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], - p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], - T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"], + p0_buffer: T.Buffer[(1, 1, 230, 230, 4), "uint8"], + p1_buffer: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], + t_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"], ) -> None: + # pylint: disable=missing-function-docstring # function attr dict T.func_attr({"tir.noalias": True, "global_symbol": "main"}) # body # with T.block("root") - conv2d_NCHWc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32", scope="global.vtcm") + conv2d_nchwc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32", scope="global.vtcm") p0_global_vtcm = T.alloc_buffer([1, 1, 230, 230, 4], dtype="uint8", scope="global.vtcm") p1_global_vtcm = T.alloc_buffer([2, 1, 7, 7, 1, 32, 4], dtype="int8", scope="global.vtcm") for ax0, ax1, ax2, ax3, ax4, ax5, ax6 in T.grid(2, 1, 7, 7, 1, 32, 4): with T.block("p1_global.vtcm"): - v0, v1, v2, v3, v4, v5, v6 = T.axis.remap( + v0_ind, v1_ind, v2_ind, v3_ind, v4_ind, v5_ind, v6_ind = T.axis.remap( "SSSSSSS", [ax0, ax1, ax2, ax3, ax4, ax5, ax6] ) - T.reads(p1[v0, v1, v2, v3, v4, v5, v6]) - T.writes(p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6]) - p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6] = p1[v0, v1, v2, v3, v4, v5, v6] - for po in T.serial(4): - for i in T.serial(55876): + T.reads(p1_buffer[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind, v5_ind, v6_ind]) + T.writes(p1_global_vtcm[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind, v5_ind, v6_ind]) + p1_global_vtcm[v0_ind, v1_ind, 
v2_ind, v3_ind, v4_ind, v5_ind, v6_ind] = p1_buffer[ + v0_ind, v1_ind, v2_ind, v3_ind, v4_ind, v5_ind, v6_ind + ] + for p_outer in T.serial(4): + for index_0 in T.serial(55876): with T.block("p0_global.vtcm"): - v0 = T.axis.spatial(1, 0) - v1 = T.axis.spatial(1, 0) - v2 = T.axis.spatial(230, po * 56 + i // 916) - v3 = T.axis.spatial(230, i % 916 // 4) - v4 = T.axis.spatial(4, i % 4) - T.reads(p0[v0, v1, v2, v3, v4]) - T.writes(p0_global_vtcm[v0, v1, v2, v3, v4]) - p0_global_vtcm[v0, v1, v2, v3, v4] = p0[v0, v1, v2, v3, v4] - for i in T.parallel(28): - for ii, iii, iiii in T.grid(2, 14, 8): + v0_ind = T.axis.spatial(1, 0) + v1_ind = T.axis.spatial(1, 0) + v2_ind = T.axis.spatial(230, p_outer * 56 + index_0 // 916) + v3_ind = T.axis.spatial(230, index_0 % 916 // 4) + v4_ind = T.axis.spatial(4, index_0 % 4) + T.reads(p0_buffer[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind]) + T.writes(p0_global_vtcm[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind]) + p0_global_vtcm[v0_ind, v1_ind, v2_ind, v3_ind, v4_ind] = p0_buffer[ + v0_ind, v1_ind, v2_ind, v3_ind, v4_ind + ] + for index_0 in T.parallel(28): + for index_1, index_2, index_3 in T.grid(2, 14, 8): with T.block("conv2d_NCHWc_int8_o_init"): n = T.axis.spatial(1, 0) - oc_chunk = T.axis.spatial(2, ii) - oh = T.axis.spatial(112, (po * 28 + i) // 14 * 14 + iii) - ow = T.axis.spatial(112, (po * 28 + i) % 14 * 8 + iiii) - oc_block_o = T.axis.spatial(1, 0) + oc_chunk = T.axis.spatial(2, index_1) + o_height = T.axis.spatial( + 112, (p_outer * 28 + index_0) // 14 * 14 + index_2 + ) + o_width = T.axis.spatial(112, (p_outer * 28 + index_0) % 14 * 8 + index_3) + oc_block_o = T.axis.spatial(1, 0) # pylint: disable=unused-variable T.reads() - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32]) + T.writes(conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32]) for i4_1 in T.vectorized(32): with T.block("conv2d_NCHWc_int8_init"): oc_block_i_init = T.axis.spatial(32, i4_1) T.reads() - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init]) - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0 + T.writes( + conv2d_nchwc_int8[ + n, oc_chunk, o_height, o_width, oc_block_i_init + ] + ) + conv2d_nchwc_int8[ + n, oc_chunk, o_height, o_width, oc_block_i_init + ] = 0 for i1_1, i5_1, i6_1, i2_2, i3_2 in T.grid(2, 7, 7, 14, 8): with T.block("conv2d_NCHWc_int8_o_update"): n = T.axis.spatial(1, 0) oc_chunk = T.axis.spatial(2, i1_1) - oh = T.axis.spatial(112, (po * 28 + i) // 14 * 14 + i2_2) - ow = T.axis.spatial(112, (po * 28 + i) % 14 * 8 + i3_2) - oc_block_o = T.axis.spatial(1, 0) - kh = T.axis.reduce(7, i5_1) - kw = T.axis.reduce(7, i6_1) + o_height = T.axis.spatial(112, (p_outer * 28 + index_0) // 14 * 14 + i2_2) + o_width = T.axis.spatial(112, (p_outer * 28 + index_0) % 14 * 8 + i3_2) + oc_block_o = T.axis.spatial(1, 0) # pylint: disable=unused-variable + k_height = T.axis.reduce(7, i5_1) + k_width = T.axis.reduce(7, i6_1) ic_outer = T.axis.reduce(1, 0) ic_f_inner = T.axis.reduce(1, 0) - ic_s_inner_o = T.axis.reduce(1, 0) + ic_s_inner_o = T.axis.reduce(1, 0) # pylint: disable=unused-variable T.reads( - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32], + conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32], p0_global_vtcm[ n, ic_outer, - oh * 2 + kh, - ow * 2 + kw, + o_height * 2 + k_height, + o_width * 2 + k_width, ic_f_inner * 4 : ic_f_inner * 4 + 4, ], - p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4], + p1_global_vtcm[ + oc_chunk, ic_outer, k_height, k_width, ic_f_inner, 0:32, 0:4 + ], ) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32]) 
- A = T.match_buffer( + T.writes(conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32]) + a_buffer = T.match_buffer( p0_global_vtcm[ n, ic_outer, - oh * 2 + kh, - ow * 2 + kw, + o_height * 2 + k_height, + o_width * 2 + k_width, ic_f_inner * 4 : ic_f_inner * 4 + 4, ], [4], @@ -543,42 +579,48 @@ def main( offset_factor=1, scope="global.vtcm", ) - B = T.match_buffer( - p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4], + b_buffer = T.match_buffer( + p1_global_vtcm[ + oc_chunk, ic_outer, k_height, k_width, ic_f_inner, 0:32, 0:4 + ], [32, 4], dtype="int8", offset_factor=1, scope="global.vtcm", ) - C = T.match_buffer( - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32], + c_buffer = T.match_buffer( + conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32], [32], dtype="int32", offset_factor=1, scope="global.vtcm", ) - A_u8x4: T.uint8x4 = A[0:4] - A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") - B_i8x128 = B[0, 0:128] - B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32") - C[0:32] = T.call_llvm_pure_intrin( + a_u8x4: T.uint8x4 = a_buffer[0:4] + a_i32: T.int32 = T.reinterpret(a_u8x4, dtype="int32") + b_i8x128 = b_buffer[0, 0:128] + b_i32x32: T.int32x32 = T.reinterpret(b_i8x128, dtype="int32x32") + c_buffer[0:32] = T.call_llvm_pure_intrin( 4217, T.uint32(3), - C[0:32], - T.broadcast(A_i32, 32), - B_i32x32, + c_buffer[0:32], + T.broadcast(a_i32, 32), + b_i32x32, dtype="int32x32", ) - for i in T.serial(200704): - with T.block("conv2d_NCHWc_int8.vtcm"): + for index_0 in T.serial(200704): + with T.block("conv2d_nchwc_int8.vtcm"): ax0_1 = T.axis.spatial(1, 0) - ax1_1 = T.axis.spatial(2, i % 7168 // 3584) - ax2_1 = T.axis.spatial(112, (po * 28 + i // 7168) // 14 * 14 + i % 3584 // 256) - ax3_1 = T.axis.spatial(112, (po * 28 + i // 7168) % 14 * 8 + i % 256 // 32) - ax4 = T.axis.spatial(32, i % 32) - T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) - T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) - T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[ + ax1_1 = T.axis.spatial(2, index_0 % 7168 // 3584) + ax2_1 = T.axis.spatial( + 112, (p_outer * 28 + index_0 // 7168) // 14 * 14 + index_0 % 3584 // 256 + ) + ax3_1 = T.axis.spatial( + 112, (p_outer * 28 + index_0 // 7168) % 14 * 8 + index_0 % 256 // 32 + ) + ax4 = T.axis.spatial(32, index_0 % 32) + T.reads(conv2d_nchwc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) + T.writes(t_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) + t_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_nchwc_int8[ ax0_1, ax1_1, ax2_1, ax3_1, ax4 ] @@ -586,122 +628,138 @@ def main( # from tvm.script import tir as T @tvm.script.ir_module class ModuleBase: + """Base module test class.""" + + # pylint: disable=no-self-argument @T.prim_func def main( - p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], - p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], - T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"], + p0_buffer: T.Buffer[(1, 1, 230, 230, 4), "uint8"], + p1_buffer: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], + t_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"], ) -> None: + # pylint: disable=missing-function-docstring # function attr dict T.func_attr({"tir.noalias": True, "global_symbol": "main"}) # buffer definition # body # with T.block("root") - conv2d_NCHWc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32") + conv2d_nchwc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32") for i0_0_i1_0_i2_0_i3_0_fused in T.parallel( 112, annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1} ): - for i4_0_0 in T.serial(1): + for i4_0_0 in 
T.serial(1): # pylint: disable=unused-variable for i1_1_init, i2_1_init, i3_1_init, i1_2_init, i2_2_init, i3_2_init in T.grid( 2, 1, 1, 1, 14, 8 ): with T.block("conv2d_NCHWc_int8_o_init"): n = T.axis.spatial(1, 0) oc_chunk = T.axis.spatial(2, i1_1_init + i1_2_init) - oh = T.axis.spatial( + o_height = T.axis.spatial( 112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1_init * 14 + i2_2_init ) - ow = T.axis.spatial( + o_width = T.axis.spatial( 112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1_init * 8 + i3_2_init ) - oc_block_o = T.axis.spatial(1, 0) + oc_block_o = T.axis.spatial(1, 0) # pylint: disable=unused-variable T.reads() - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32]) + T.writes(conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32]) for i4_1 in T.vectorized(32): with T.block("conv2d_NCHWc_int8_init"): oc_block_i_init = T.axis.spatial(32, i4_1) T.reads() - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init]) - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0 - for i5_0, i6_0, i7_0, i8_0, i9_0_0 in T.grid(1, 1, 1, 1, 1): + T.writes( + conv2d_nchwc_int8[ + n, oc_chunk, o_height, o_width, oc_block_i_init + ] + ) + conv2d_nchwc_int8[ + n, oc_chunk, o_height, o_width, oc_block_i_init + ] = 0 + for i5_0, i6_0, i7_0, i8_0, i9_0_0 in T.grid( # pylint: disable=unused-variable + 1, 1, 1, 1, 1 + ): # pylint: disable=unused-variable for ( - i0_1, + i0_1, # pylint: disable=unused-variable i1_1, i2_1, i3_1, - i4_0_1, + i4_0_1, # pylint: disable=unused-variable i5_1, i6_1, - i7_1, - i8_1, - i9_0_1, - i0_2, + i7_1, # pylint: disable=unused-variable + i8_1, # pylint: disable=unused-variable + i9_0_1, # pylint: disable=unused-variable + i0_2, # pylint: disable=unused-variable i1_2, i2_2, i3_2, - i4_0_2, + i4_0_2, # pylint: disable=unused-variable ) in T.grid(1, 2, 1, 1, 1, 7, 7, 1, 1, 1, 1, 1, 14, 8, 1): with T.block("conv2d_NCHWc_int8_o_update"): n = T.axis.spatial(1, 0) oc_chunk = T.axis.spatial(2, i1_1 + i1_2) - oh = T.axis.spatial( + o_height = T.axis.spatial( 112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1 * 14 + i2_2 ) - ow = T.axis.spatial( + o_width = T.axis.spatial( 112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1 * 8 + i3_2 ) - oc_block_o = T.axis.spatial(1, 0) - kh = T.axis.reduce(7, i5_0 * 7 + i5_1) - kw = T.axis.reduce(7, i6_0 * 7 + i6_1) + oc_block_o = T.axis.spatial(1, 0) # pylint: disable=unused-variable + k_height = T.axis.reduce(7, i5_0 * 7 + i5_1) + k_width = T.axis.reduce(7, i6_0 * 7 + i6_1) ic_outer = T.axis.reduce(1, 0) ic_f_inner = T.axis.reduce(1, 0) - ic_s_inner_o = T.axis.reduce(1, 0) + ic_s_inner_o = T.axis.reduce(1, 0) # pylint: disable=unused-variable T.reads( - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32], - p0[ + conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32], + p0_buffer[ n, ic_outer, - oh * 2 + kh, - ow * 2 + kw, + o_height * 2 + k_height, + o_width * 2 + k_width, ic_f_inner * 4 : ic_f_inner * 4 + 4, ], - p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4], + p1_buffer[ + oc_chunk, ic_outer, k_height, k_width, ic_f_inner, 0:32, 0:4 + ], ) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32]) - A = T.match_buffer( - p0[ + T.writes(conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32]) + a_buffer = T.match_buffer( + p0_buffer[ n, ic_outer, - oh * 2 + kh, - ow * 2 + kw, + o_height * 2 + k_height, + o_width * 2 + k_width, ic_f_inner * 4 : ic_f_inner * 4 + 4, ], [4], dtype="uint8", offset_factor=1, ) - B = T.match_buffer( - p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4], + b_buffer = T.match_buffer( + p1_buffer[ + oc_chunk, 
ic_outer, k_height, k_width, ic_f_inner, 0:32, 0:4 + ], [32, 4], dtype="int8", offset_factor=1, ) - C = T.match_buffer( - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32], + c_buffer = T.match_buffer( + conv2d_nchwc_int8[n, oc_chunk, o_height, o_width, 0:32], [32], dtype="int32", offset_factor=1, ) - A_u8x4: T.uint8x4 = A[0:4] - A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") - B_i8x128 = B[0, 0:128] - B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32") - C[0:32] = T.call_llvm_pure_intrin( + a_u8x4: T.uint8x4 = a_buffer[0:4] + a_i32: T.int32 = T.reinterpret(a_u8x4, dtype="int32") + b_i8x128 = b_buffer[0, 0:128] + b_i32x32: T.int32x32 = T.reinterpret(b_i8x128, dtype="int32x32") + c_buffer[0:32] = T.call_llvm_pure_intrin( 4217, T.uint32(3), - C[0:32], - T.broadcast(A_i32, 32), - B_i32x32, + c_buffer[0:32], + T.broadcast(a_i32, 32), + b_i32x32, dtype="int32x32", ) for ax0, ax1, ax2, ax3 in T.grid(1, 2, 14, 8): @@ -715,44 +773,50 @@ def main( 112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + ax3 ) ax4 = T.axis.spatial(32, ax4_fused) - T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) - T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) - T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[ + T.reads(conv2d_nchwc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) + T.writes(t_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) + t_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_nchwc_int8[ ax0_1, ax1_1, ax2_1, ax3_1, ax4 ] @tvm.testing.requires_hexagon def test_meta(hexagon_session): + """Test meta.""" if tvm.testing.utils.IS_IN_CI: pytest.skip("Skipping test since it takes too long in CI.") - a = default_rng().integers(1, 8, (1, 1, 230, 230, 4), dtype="uint8") - w = default_rng().integers(1, 8, (2, 1, 7, 7, 1, 32, 4), dtype="int8") - c = np.zeros((1, 2, 112, 112, 32), dtype="int32") + a_data = default_rng().integers(1, 8, (1, 1, 230, 230, 4), dtype="uint8") + w_data = default_rng().integers(1, 8, (2, 1, 7, 7, 1, 32, 4), dtype="int8") + c_data = np.zeros((1, 2, 112, 112, 32), dtype="int32") sch = tvm.tir.Schedule(ModuleBase) - base_runtime = evaluate(hexagon_session, sch, a, w, c) + base_runtime = evaluate(hexagon_session, sch, a_data, w_data, c_data) sch = tvm.tir.Schedule(ModulePipelined) compute_block = sch.get_block("conv2d_NCHWc_int8_o_update") - o = sch.get_loops(compute_block)[0] + outer = sch.get_loops(compute_block)[0] - unscheduled_vtcm_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1) + unscheduled_vtcm_runtime = evaluate( + hexagon_session, sch, a_data, w_data, c_data, use_async_copy=1 + ) sch = tvm.tir.Schedule(ModulePipelined) compute_block = sch.get_block("conv2d_NCHWc_int8_o_update") - o = sch.get_loops(compute_block)[0] + outer = sch.get_loops(compute_block)[0] - sch.annotate(o, "software_pipeline_stage", [0, 1, 2]) - sch.annotate(o, "software_pipeline_order", [0, 1, 2]) - sch.annotate(o, "software_pipeline_async_stages", [0, 2]) + sch.annotate(outer, "software_pipeline_stage", [0, 1, 2]) + sch.annotate(outer, "software_pipeline_order", [0, 1, 2]) + sch.annotate(outer, "software_pipeline_async_stages", [0, 2]) - pipeline_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1) + pipeline_runtime = evaluate(hexagon_session, sch, a_data, w_data, c_data, use_async_copy=1) - transfer_mb = round((a.size + w.size + c.size) / 1e6, 2) + transfer_mb = round((a_data.size + w_data.size + c_data.size) / 1e6, 2) print_results( - f"Test with A.size: {a.size}, W.size: {w.size}, and total memory transfer of {transfer_mb} MB...", + ( + f"Test with a_buffer.size: 
{a_data.size}, w_buffer.size: {w_data.size}" + f", and total memory transfer of {transfer_mb} MB..." + ), { "without_vtcm": base_runtime, "unscheduled_vtcm_runtime": unscheduled_vtcm_runtime, diff --git a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py index 7e8a6d79f492..24d1a3f788cf 100644 --- a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py +++ b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py @@ -28,7 +28,7 @@ - Testing parameters (input shapes, dtypes, etc.) currently support only one value for each parameter. - - H, W, C must be integer multiples of 8, 8, and 32, + - height, width, channel must be integer multiples of 8, 8, and 32, respectively. I.e., partial blocks aren't currently supported by this script. @@ -42,23 +42,25 @@ primfuncs and demonstrate more coding strategies. """ -import pytest -import numpy as np +from typing import List import copy import os +import pytest +import numpy as np + import tvm.testing from tvm import te, topi, tir from tvm.topi import testing from tvm.contrib.hexagon.session import Session -from typing import List + from .infrastructure import allocate_hexagon_array, get_hexagon_target from . import benchmark_util as bu # Pytest seems to require that fixture names exist in the current module. # E.g., it doesn't allow: @pytest.mark.usefixtures("bu.benchmark_group") -benchmark_group = bu.benchmark_group +BENCHMARK_GROUP = bu.benchmark_group _SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_benchmarks_flag_and_reason() @@ -67,25 +69,25 @@ def _ceil_div(numerator, denominator): return (numerator + (denominator - 1)) // denominator -def _int8_nhwc_8h8w32c_map(n, h, w, c): +def _int8_nhwc_8h8w32c_map(n_batch, height, width, channel): return [ - n, - h // 8, - w // 8, - c // 32, + n_batch, + height // 8, + width // 8, + channel // 32, te.AXIS_SEPARATOR, - h % 8, - w % 8, - c % 32, + height % 8, + width % 8, + channel % 32, ] -def _int8_nhwc_8h8w32c_shape(n, h, w, c) -> List[int]: +def _int8_nhwc_8h8w32c_shape(n_batch, height, width, channel) -> List[int]: return [ - n, - _ceil_div(h, 8), - _ceil_div(w, 8), - _ceil_div(c, 32), + n_batch, + _ceil_div(height, 8), + _ceil_div(width, 8), + _ceil_div(channel, 32), 8, 8, 32, @@ -100,10 +102,10 @@ def _int8_nhwc_8h8w32c_xform_immediate(arr_in: np.ndarray) -> np.ndarray: stage1 = copy.copy(arr_in) ( - n, - h, - w, - c, + n_batch, + height, + width, + channel, ) = stage1.shape ( @@ -112,9 +114,9 @@ def _int8_nhwc_8h8w32c_xform_immediate(arr_in: np.ndarray) -> np.ndarray: c_minor, ) = [8, 8, 32] - h_major = _ceil_div(h, h_minor) - w_major = _ceil_div(w, w_minor) - c_major = _ceil_div(c, c_minor) + h_major = _ceil_div(height, h_minor) + w_major = _ceil_div(width, w_minor) + c_major = _ceil_div(channel, c_minor) # This handles cases where the dimensions of arr_in are not cleanly divided # by the minor block size, i.e. [8, 8, 32]. @@ -122,10 +124,12 @@ def _int8_nhwc_8h8w32c_xform_immediate(arr_in: np.ndarray) -> np.ndarray: # Any additional array elements that this creates will ahve value 0. # We shouldn't actually care what value is used for those elements, because they # shouldn't be treated as meaningful by any of our algorithms. 
- if (h % h_minor) or (w % w_minor) or (c % c_minor): - stage1.resize((n, h_major * h_minor, w_major * w_minor, c_major * c_minor), refcheck=False) + if (height % h_minor) or (width % w_minor) or (channel % c_minor): + stage1.resize( + (n_batch, h_major * h_minor, w_major * w_minor, c_major * c_minor), refcheck=False + ) - stage2 = stage1.reshape(n, h_major, h_minor, w_major, w_minor, c_major, c_minor) + stage2 = stage1.reshape(n_batch, h_major, h_minor, w_major, w_minor, c_major, c_minor) stage3 = stage2.transpose(0, 1, 3, 5, 2, 4, 6) return stage3 @@ -137,8 +141,10 @@ def _create_test_input(shape, dtype: str) -> np.ndarray: return np.random.randint(low=min_value, high=max_value, size=tuple(shape), dtype=np.int8) -@pytest.mark.usefixtures("benchmark_group") +@pytest.mark.usefixtures("BENCHMARK_GROUP") class TestMaxPool2D: + """maxpool2D base test class""" + csv_column_order = [ # Identifies which TE-compute / TIRScript is used as the basis for the # benchmarked primfunc. Only needs to be meaningful to humans. @@ -150,12 +156,12 @@ class TestMaxPool2D: # Values directly based on test parameters... "input_shape_4d", "block_shape", - "DTYPE", - "KERNEL", - "STRIDE", - "DILATION", - "PADDING", - "IO_TENSOR_MEM_SCOPE", + "dtype", + "kernel", + "stride", + "dilation", + "padding", + "io_tensor_mem_scope", # Reserved columns defined by the BenchmarksTable class. "row_status", "timings_min_usecs", @@ -170,48 +176,50 @@ class TestMaxPool2D: "comments", ] - DTYPE = tvm.testing.parameter("int8") + dtype = tvm.testing.parameter("int8") - # FIXME(cconvey): The script currently fails when H, W, or C is not an + # FIXME(cconvey): The script currently fails when height, width, or channel is not an # integer multiple of 8, 8, or 32, respectively. - N = tvm.testing.parameter(1) - H = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]]) - W = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]]) - C = tvm.testing.parameter(*[x * 32 for x in [1, 2]]) + n_batch = tvm.testing.parameter(1) + height = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]]) + width = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]]) + channel = tvm.testing.parameter(*[x * 32 for x in [1, 2]]) - KERNEL = tvm.testing.parameter((1, 1), (3, 3)) - STRIDE = tvm.testing.parameter((1, 1)) - DILATION = tvm.testing.parameter((1, 1)) - PADDING = tvm.testing.parameter((0, 0, 0, 0)) - IO_TENSOR_MEM_SCOPE = tvm.testing.parameter("global.vtcm") + kernel = tvm.testing.parameter((1, 1), (3, 3)) + stride = tvm.testing.parameter((1, 1)) + dilation = tvm.testing.parameter((1, 1)) + padding = tvm.testing.parameter((0, 0, 0, 0)) + io_tensor_mem_scope = tvm.testing.parameter("global.vtcm") @pytest.mark.skipif(_SHOULD_SKIP_BENCHMARKS, reason=_SKIP_BENCHMARKS_REASON) @tvm.testing.requires_hexagon def test_maxpool2d_nhwc( self, - N, - H, - W, - C, - DTYPE, - KERNEL, - STRIDE, - DILATION, - PADDING, - IO_TENSOR_MEM_SCOPE, + n_batch, + height, + width, + channel, + dtype, + kernel, + stride, + dilation, + padding, + io_tensor_mem_scope, hexagon_session: Session, ): + """Test maxpool2d NHWC""" + keys_dict = { "basic_kernel": "max_pool2d", "sched_type": 1, - "input_shape_4d": [N, H, W, C], + "input_shape_4d": [n_batch, height, width, channel], "block_shape": [8, 8, 32], - "DTYPE": DTYPE, - "KERNEL": KERNEL, - "STRIDE": STRIDE, - "DILATION": DILATION, - "PADDING": PADDING, - "IO_TENSOR_MEM_SCOPE": IO_TENSOR_MEM_SCOPE, + "dtype": dtype, + "kernel": kernel, + "stride": stride, + "dilation": dilation, + "padding": padding, + "io_tensor_mem_scope": io_tensor_mem_scope, 
} desc = bu.get_benchmark_decription(keys_dict) @@ -229,13 +237,13 @@ def test_maxpool2d_nhwc( log_file.write(f"CONFIGURATION: {desc}\n") try: - input_tensor_shape_4d = [N, H, W, C] - input_tensor_shape_7d = _int8_nhwc_8h8w32c_shape(N, H, W, C) + input_tensor_shape_4d = [n_batch, height, width, channel] + input_tensor_shape_7d = _int8_nhwc_8h8w32c_shape(n_batch, height, width, channel) - data = te.placeholder(tuple(input_tensor_shape_4d), dtype=DTYPE) + data = te.placeholder(tuple(input_tensor_shape_4d), dtype=dtype) output = topi.nn.pool2d( - data, KERNEL, STRIDE, DILATION, PADDING, "max", layout="NHWC" + data, kernel, stride, dilation, padding, "max", layout="NHWC" ) primfunc = te.create_prim_func([data, output]) @@ -262,20 +270,21 @@ def test_maxpool2d_nhwc( # Note that we'll eventually need it in two different layouts: # (1) NHWC as an argument to testing.poolnd_python. # (2) NHWC_8h8w32c for as an argument to our Hexagon primfunc. - # a_numpy_4d = np.random.randint(low=-128, high=127, size=input_tensor_shape_4d, dtype=np.int8) - a_numpy_4d = _create_test_input(input_tensor_shape_4d, DTYPE) + # a_numpy_4d = np.random.randint(low=-128, high=127, + # size=input_tensor_shape_4d, dtype=np.int8) + a_numpy_4d = _create_test_input(input_tensor_shape_4d, dtype) ref_output_4d = testing.poolnd_python( a_numpy_4d.astype("int32"), - KERNEL, - STRIDE, - DILATION, - PADDING[0:2], - PADDING[2:], + kernel, + stride, + dilation, + padding[0:2], + padding[2:], pool_type="max", dtype="int32", layout="NHWC", - ).astype(DTYPE) + ).astype(dtype) output_tensor_shape_4d = ref_output_4d.shape @@ -285,28 +294,25 @@ def test_maxpool2d_nhwc( hexagon_session.device, tensor_shape=input_tensor_shape_7d, axis_separators=[4], - dtype=DTYPE, - mem_scope=IO_TENSOR_MEM_SCOPE, + dtype=dtype, + mem_scope=io_tensor_mem_scope, ) c_hexagon_4d = allocate_hexagon_array( hexagon_session.device, tensor_shape=output_tensor_shape_4d, axis_separators=[], - dtype=DTYPE, - mem_scope=IO_TENSOR_MEM_SCOPE, + dtype=dtype, + mem_scope=io_tensor_mem_scope, ) a_hexagon_7d.copyfrom(a_numpy_7d) - if DTYPE == "int8": + if dtype == "int8": rel_tolerance = 0 abs_tolerance = 0 else: - assert False, f"TODO: decide acceptable tolerances for DTYPE {DTYPE}" - - # hexagon_mod(a_hexagon_7d, c_hexagon_4d) - # tvm.testing.assert_allclose(ref_output_4d, c_hexagon_4d.numpy(), rtol=rel_tolerance, atol=abs_tolerance) + assert False, f"TODO: decide acceptable tolerances for dtype {dtype}" timer = hexagon_mod.time_evaluator( "main", hexagon_session.device, number=10, repeat=1 @@ -317,29 +323,29 @@ def test_maxpool2d_nhwc( tvm.testing.assert_allclose( ref_output_4d, c_hexagon_4d.numpy(), rtol=rel_tolerance, atol=abs_tolerance ) - except AssertionError as e: - raise bu.NumericalAccuracyException(str(e)) + except AssertionError as exception: + raise bu.NumericalAccuracyException(str(exception)) - except bu.NumericalAccuracyException as e: + except bu.NumericalAccuracyException as exception: print() print(f"FAIL: Numerical accuracy error. See log file.") log_file.write("\n") - log_file.write(f"FAIL: {e}\n") + log_file.write(f"FAIL: {exception}\n") self.benchmark_table.record_fail( **keys_dict, comments=f"Numerical accuracy error. See log file." 
)

-        except bu.UnsupportedException as e:
+        except bu.UnsupportedException as exception:
             print()
-            print(f"SKIP: {e}")
+            print(f"SKIP: {exception}")

             log_file.write("\n")
-            log_file.write(f"SKIP: {e}\n")
+            log_file.write(f"SKIP: {exception}\n")

             self.benchmark_table.record_skip(
-                **keys_dict, comments=f"Unsupported configuration: {e}"
+                **keys_dict, comments=f"Unsupported configuration: {exception}"
             )

         self.benchmark_table.record_success(timing_result, **keys_dict)
diff --git a/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py b/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py
index 5ec46cf4ae70..40edbda550b7 100644
--- a/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py
+++ b/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py
@@ -15,11 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.

-import math
-import struct
-import numpy as np
-import tvm.topi.hexagon.utils as utils
-
 """
 Test float to fixed-point conversion. We do it by constructing a numpy array with the
 wide range of floating-point values. These values are converted into the
@@ -29,9 +24,17 @@
 raised if they happened to be outside of the expected tolerance.
 """

+import math
+import struct
+import numpy as np
+import tvm.topi.hexagon.utils as utils
+

 class TestFixedPointConversion:
+    """Fixed point conversion test class"""
+
     def test_fixed_point_conversion(self):
+        """Test fixed point conversion"""
         # Construct array with wide range of values
         fp1 = np.random.uniform(0.00001, 0.0002, size=(10))
         fp2 = np.random.uniform(0.001, 0.02, size=(10))
diff --git a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
index 43feb827af42..5eac35f2d683 100644
--- a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
+++ b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py
@@ -14,6 +14,8 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""Test Fixed Point Multiply on Hexagon."""
+
 import re

 import numpy as np
@@ -80,91 +82,92 @@ def run_module(mod, inputs):
     return output


-in_scale_const, out_scale_const = tvm.testing.parameters(
-    (1.3, 30.0),
-    (1.37, 1.0),
-    (0.6, 1.0),
-    ((1.7, 0.6), 1.0),
-    ((0.007, 1.9), 1.0),
-)
-
-multiplier, shift = tvm.testing.parameters(
-    (1288490240, -2),  # 0.15
-    (1395864320, 1),  # 1.3
-    (1288490188, 0),  # 0.6
-)
-
-
-@tvm.testing.requires_hexagon
-def test_fixed_point_multiply(hexagon_session: Session, multiplier: int, shift: int):
-    ishape = (6, 32)
-    a = relay.var("a", relay.TensorType(ishape, "int32"))
-    fpm = relay.fixed_point_multiply(a, multiplier, shift)
-    relay_mod = tvm.IRModule.from_expr(fpm)
-
-    with tvm.transform.PassContext(opt_level=3):
-        # Compile for Hexagon...
-        hexagon_lowered = build_module(relay_mod, HEXAGON_AOT_LLVM_TARGET)
-
-        # Compile for LLVM...
-        llvm_lowered = build_module(relay_mod, tvm.target.Target("llvm"))
-
-    data_in = np.arange(-96, 96).reshape(ishape)
-    inputs = {"a": data_in}
-
-    # Run hexagon...
-    hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered)
-    hexagon_output = run_module(hexagon_mod, inputs)
+class TestFixedPointMultiply:
+    """Fixed point Multiply test class"""

-    # Run llvm...
- llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0))) - expected_output = run_module(llvm_mod, inputs) - - tvm.testing.assert_allclose(hexagon_output, expected_output) - - -@tvm.testing.requires_hexagon -def test_per_channel_fixed_point_multiply( - hexagon_session: Session, in_scale_const, out_scale_const -): - ishape = [1, 128, 56, 56] - axis = 1 - a = relay.var("a", shape=ishape, dtype="int32") - - # Make list of input scales from in_scale_const parameter. - if isinstance(in_scale_const, tuple): - in_scale = list(in_scale_const) * (ishape[axis] // len(in_scale_const)) - else: - in_scale = [in_scale_const] * ishape[axis] - assert len(in_scale) == ishape[axis] - - # qnn.requantize is lowered to fixed_point_multiply if zp == 0 and in_dtype == out_dtype. - iscale = relay.const(in_scale) - izero = relay.const(0) - oscale = relay.const(out_scale_const) - ozero = relay.const(0) - op = relay.qnn.op.requantize(a, iscale, izero, oscale, ozero, axis=axis, out_dtype="int32") - mod = tvm.IRModule.from_expr(op) - - with tvm.transform.PassContext(opt_level=3): - # Compile for Hexagon... - hexagon_lowered = build_module(mod, HEXAGON_AOT_LLVM_TARGET) - - # Compile for LLVM... - llvm_lowered = build_module(mod, tvm.target.Target("llvm")) - - a_np = np.random.randint(-1000, 1000, size=np.prod(ishape)).reshape(ishape) - inputs = {"a": a_np} - - # Run hexagon... - hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered) - hexagon_output = run_module(hexagon_mod, inputs) + in_scale_const, out_scale_const = tvm.testing.parameters( + (1.3, 30.0), + (1.37, 1.0), + (0.6, 1.0), + ((1.7, 0.6), 1.0), + ((0.007, 1.9), 1.0), + ) - # Run llvm... - llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0))) - expected_output = run_module(llvm_mod, inputs) + multiplier, shift = tvm.testing.parameters( + (1288490240, -2), # 0.15 + (1395864320, 1), # 1.3 + (1288490188, 0), # 0.6 + ) - tvm.testing.assert_allclose(hexagon_output, expected_output) + @tvm.testing.requires_hexagon + def test_fixed_point_multiply(self, hexagon_session: Session, multiplier: int, shift: int): + """Fixed point multiply test.""" + ishape = (6, 32) + a = relay.var("a", relay.TensorType(ishape, "int32")) + fpm = relay.fixed_point_multiply(a, multiplier, shift) + relay_mod = tvm.IRModule.from_expr(fpm) + + with tvm.transform.PassContext(opt_level=3): + # Compile for Hexagon... + hexagon_lowered = build_module(relay_mod, HEXAGON_AOT_LLVM_TARGET) + + # Compile for LLVM... + llvm_lowered = build_module(relay_mod, tvm.target.Target("llvm")) + + data_in = np.arange(-96, 96).reshape(ishape) + inputs = {"a": data_in} + + # Run hexagon... + hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered) + hexagon_output = run_module(hexagon_mod, inputs) + + # Run llvm... + llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0))) + expected_output = run_module(llvm_mod, inputs) + + tvm.testing.assert_allclose(hexagon_output, expected_output) + + @tvm.testing.requires_hexagon + def test_per_channel(self, hexagon_session: Session, in_scale_const, out_scale_const): + """Per channel multiply test.""" + ishape = [1, 128, 56, 56] + axis = 1 + a = relay.var("a", shape=ishape, dtype="int32") + + # Make list of input scales from in_scale_const parameter. 
+ if isinstance(in_scale_const, tuple): + in_scale = list(in_scale_const) * (ishape[axis] // len(in_scale_const)) + else: + in_scale = [in_scale_const] * ishape[axis] + assert len(in_scale) == ishape[axis] + + # qnn.requantize is lowered to fixed_point_multiply if zp == 0 and in_dtype == out_dtype. + iscale = relay.const(in_scale) + izero = relay.const(0) + oscale = relay.const(out_scale_const) + ozero = relay.const(0) + op = relay.qnn.op.requantize(a, iscale, izero, oscale, ozero, axis=axis, out_dtype="int32") + mod = tvm.IRModule.from_expr(op) + + with tvm.transform.PassContext(opt_level=3): + # Compile for Hexagon... + hexagon_lowered = build_module(mod, HEXAGON_AOT_LLVM_TARGET) + + # Compile for LLVM... + llvm_lowered = build_module(mod, tvm.target.Target("llvm")) + + a_np = np.random.randint(-1000, 1000, size=np.prod(ishape)).reshape(ishape) + inputs = {"a": a_np} + + # Run hexagon... + hexagon_mod = hexagon_session.get_executor_from_factory(hexagon_lowered) + hexagon_output = run_module(hexagon_mod, inputs) + + # Run llvm... + llvm_mod = tvm.runtime.executor.AotModule(llvm_lowered["default"](tvm.cpu(0))) + expected_output = run_module(llvm_mod, inputs) + + tvm.testing.assert_allclose(hexagon_output, expected_output) if __name__ == "__main__": diff --git a/tests/python/contrib/test_hexagon/test_memory_alloc.py b/tests/python/contrib/test_hexagon/test_memory_alloc.py index a6d011eddd5a..f44e3cd0dc36 100644 --- a/tests/python/contrib/test_hexagon/test_memory_alloc.py +++ b/tests/python/contrib/test_hexagon/test_memory_alloc.py @@ -14,8 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -import os.path +"""Test memory allocation.""" import numpy as np @@ -25,23 +24,25 @@ from .infrastructure import allocate_hexagon_array, get_hexagon_target -@tvm.testing.fixture -def generated_func(shape, scope, dtype, axis_separators): +def generated_func(shape: tuple, dtype: str, axis_separators: list): + """Generate element wise function.""" dim0, dim1 = shape @T.prim_func def elwise(a: T.handle, b: T.handle): - A = T.match_buffer(a, shape, dtype=dtype, axis_separators=axis_separators) - B = T.match_buffer(b, shape, dtype=dtype, axis_separators=axis_separators) + a_buffer = T.match_buffer(a, shape, dtype=dtype, axis_separators=axis_separators) + b_buffer = T.match_buffer(b, shape, dtype=dtype, axis_separators=axis_separators) for i, j in T.grid(dim0, dim1): with T.block("compute"): - B[i, j] = A[i, j] * T.cast(2, dtype=dtype) + b_buffer[i, j] = a_buffer[i, j] * T.cast(2, dtype=dtype) return elwise class TestMemoryAlloc: + """Memory allocation test.""" + dtype = tvm.testing.parameter("int8") shape = tvm.testing.parameter((128, 128)) @@ -53,11 +54,10 @@ class TestMemoryAlloc: ("global.ddr", [1]), ) - def test_global_axis_separator( - self, hexagon_session, generated_func, shape, dtype, scope, axis_separators - ): + def test_global_axis_separator(self, hexagon_session, shape, dtype, scope, axis_separators): + """Test with global axis separator.""" mod1 = tvm.build( - generated_func, + generated_func(shape, dtype, axis_separators), target=get_hexagon_target("v69"), ) mod2 = hexagon_session.load_module(mod1) diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py index a7f4cbc39cb1..9f4fd0768c50 100644 --- a/tests/python/contrib/test_hexagon/test_meta_schedule.py +++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py @@ -44,24 +44,31 
@@ @tvm.script.ir_module class MatmulModule: + """Matmultest class""" + @T.prim_func def main( # type: ignore # pylint: disable=no-self-argument a: T.handle, b: T.handle, c: T.handle ) -> None: + """main function""" T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, (16, 16), "float32") - B = T.match_buffer(b, (16, 16), "float32") - C = T.match_buffer(c, (16, 16), "float32") + a_buffer = T.match_buffer(a, (16, 16), "float32") + b_buffer = T.match_buffer(b, (16, 16), "float32") + c_buffer = T.match_buffer(c, (16, 16), "float32") for i, j, k in T.grid(16, 16, 16): with T.block("matmul"): - vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + vi_axis, vj_axis, vk_axis = T.axis.remap("SSR", [i, j, k]) with T.init(): - C[vi, vj] = 0.0 # type: ignore - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + c_buffer[vi_axis, vj_axis] = 0.0 # type: ignore + c_buffer[vi_axis, vj_axis] = ( + c_buffer[vi_axis, vj_axis] + + a_buffer[vi_axis, vk_axis] * b_buffer[vk_axis, vj_axis] + ) @tvm.testing.requires_hexagon def test_builder_runner(hexagon_launcher): + """Test builder and runner.""" if hexagon_launcher.is_simulator(): pytest.skip(msg="Tuning on simulator not supported.") @@ -96,33 +103,35 @@ def test_builder_runner(hexagon_launcher): assert result >= 0.0 -def dense(m, n, k): +def dense_compute(m, n, k): + """dense compute""" X = te.placeholder((m, k), name="X", dtype="uint8") - packedW = te.placeholder((n // 32, k // 4, 32, 4), name="packedW", dtype="uint8") + packed_width = te.placeholder((n // 32, k // 4, 32, 4), name="packed_width", dtype="uint8") - ak = te.reduce_axis((0, k), name="k") + axis_k = te.reduce_axis((0, k), name="k") out = te.compute( (m, n), lambda i, j: te.sum( - X[i, ak].astype("int32") - * packedW[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j % 32, ak % 4].astype( - "int32" - ), - axis=ak, + X[i, axis_k].astype("int32") + * packed_width[ + tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(axis_k, 4), j % 32, axis_k % 4 + ].astype("int32"), + axis=axis_k, ), name="compute", ) - return [X, packedW, out] + return [X, packed_width, out] -def schedule_dense(sch, block, M, do_tune): +def schedule_dense(sch, block, m_size, do_tune): + """dense schedule""" a_y, a_x, _ = sch.get_loops(block)[-3:] if do_tune: y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128) a_yo, a_yi = sch.split(a_y, factors=y_factors) else: - a_yo, a_yi = sch.split(a_y, factors=[None, min(M, 32)]) + a_yo, a_yi = sch.split(a_y, factors=[None, min(m_size, 32)]) a_xo, a_xi = sch.split(a_x, factors=[None, 32]) sch.reorder(a_yo, a_xo, a_yi, a_xi) @@ -143,51 +152,55 @@ def schedule_dense(sch, block, M, do_tune): sch.tensorize(a_xi, VRMPY_u8u8i32_INTRIN) -def verify_dense(sch, target, M, N, K, hexagon_session): +def verify_dense(sch, target, m_size, n_size, k_size, hexagon_session): + """Verify dense operator.""" f = tvm.build(sch.mod["main"], target=target, name="dense") mod = hexagon_session.load_module(f) dev = hexagon_session.device - a_np = np.random.uniform(1, 10, size=(M, K)).astype("uint8") - b_np = np.random.uniform(1, 10, size=(N, K)).astype("uint8") + a_np = np.random.uniform(1, 10, size=(m_size, k_size)).astype("uint8") + b_np = np.random.uniform(1, 10, size=(n_size, k_size)).astype("uint8") c_np = np.dot(a_np.astype("int32"), b_np.transpose().astype("int32")) - packW = np.random.uniform(1, 10, size=(N // 32, (K // 4), 32, 4)).astype("uint8") + pack_width = np.random.uniform(1, 10, size=(n_size // 32, (k_size // 4), 32, 4)).astype("uint8") - for r_idx in range(N // 32): - 
for ko in range(K // 4):
+    for r_idx in range(n_size // 32):
+        for k_output in range(k_size // 4):
             for s_idx in range(32):
                 for t_idx in range(4):
-                    packW[r_idx][ko][s_idx][t_idx] = b_np[r_idx * 32 + s_idx][ko * 4 + t_idx]
+                    pack_width[r_idx][k_output][s_idx][t_idx] = b_np[r_idx * 32 + s_idx][
+                        k_output * 4 + t_idx
+                    ]
 
     a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(packW, dev)
-    c = tvm.nd.array(np.zeros((M, N), dtype="int32"), dev)
+    b = tvm.nd.array(pack_width, dev)
+    c = tvm.nd.array(np.zeros((m_size, n_size), dtype="int32"), dev)
 
     mod(a, b, c)
 
     np.testing.assert_equal(c.numpy(), c_np)
 
     evaluator = mod.time_evaluator(mod.entry_name, dev, number=10)
-    gflops = (N * M * K) * 2 / 1e9
+    gflops = (n_size * m_size * k_size) * 2 / 1e9
     time_ms = evaluator(a, b, c).mean * 1e3
     print("%f ms, %f GOPS" % (time_ms, gflops / (time_ms / 1e3)))
 
 
 @tvm.testing.requires_hexagon
 def test_vrmpy_dense(hexagon_launcher):
+    """Test vector reduce multiply dense."""
     if hexagon_launcher.is_simulator():
         pytest.skip(msg="Tuning on simulator not supported.")
 
     do_tune = True
 
-    M, N, K = 128, 768, 768
-    workload = te.create_prim_func(dense(M, N, K))
+    m_size, n_size, k_size = 128, 768, 768
+    workload = te.create_prim_func(dense_compute(m_size, n_size, k_size))
 
     if not do_tune:
         ir_module = tvm.IRModule({"main": workload})
         sch = tvm.tir.Schedule(ir_module)
         block = sch.get_block("compute")
-        schedule_dense(sch, block, M, do_tune)
+        schedule_dense(sch, block, m_size, do_tune)
     else:
         with tempfile.TemporaryDirectory() as work_dir:

@@ -214,19 +227,23 @@ def schedule_dense_for_tune(sch):
         sch = ms.tir_integration.compile_tir(database, workload, target)
 
     with hexagon_launcher.create_session() as session:
-        verify_dense(sch, get_hexagon_target("v68"), M, N, K, session)
+        verify_dense(sch, get_hexagon_target("v68"), m_size, n_size, k_size, session)
 
 
 # This is an example of a schedule found by vrmpy auto tensorization.
 # It gets 440 GFLOPS on SD888. 
@tvm.script.ir_module
-class Module_vrmpy_auto_tensorize:
+class ModuleVRMPYAutoTensorize:
+    """Vector Reduce Multiply auto tensorize test class."""
+
+    # pylint: disable=no-self-argument
     @T.prim_func
     def main(  # type: ignore
         X: T.Buffer[(128, 768), "uint8"],  # type: ignore
-        packedW: T.Buffer[(24, 192, 32, 4), "uint8"],  # type: ignore
+        packed_width: T.Buffer[(24, 192, 32, 4), "uint8"],  # type: ignore
         compute: T.Buffer[(128, 768), "int32"],  # type: ignore
     ) -> None:
+        """Main function."""
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
         for i0_0_i1_0_0_fused in T.parallel(
             512, annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}
@@ -251,33 +268,42 @@ def main(  # type: ignore
                         T.reads(
                             compute[i, j_o * 32 : j_o * 32 + 32],  # type: ignore
                             X[i, k_o * 4 : k_o * 4 + 4],  # type: ignore
-                            packedW[j_o, k_o, 0:32, 0:4],  # type: ignore
+                            packed_width[j_o, k_o, 0:32, 0:4],  # type: ignore
                         )
                         T.writes(compute[i, j_o * 32 : j_o * 32 + 32])  # type: ignore
-                        A = T.match_buffer(
-                            X[i, k_o * 4 : k_o * 4 + 4], [4], dtype="uint8", offset_factor=1  # type: ignore
+                        a_buffer = T.match_buffer(
+                            X[i, k_o * 4 : k_o * 4 + 4],
+                            [4],
+                            dtype="uint8",
+                            offset_factor=1,  # type: ignore
                         )
-                        B = T.match_buffer(
-                            packedW[j_o, k_o, 0:32, 0:4], [32, 4], dtype="uint8", offset_factor=1
+                        b_buffer = T.match_buffer(
+                            packed_width[j_o, k_o, 0:32, 0:4], [32, 4], dtype="uint8", offset_factor=1
                         )
-                        C = T.match_buffer(
-                            compute[i, j_o * 32 : j_o * 32 + 32], [32], dtype="int32", offset_factor=1  # type: ignore
+                        c_buffer = T.match_buffer(
+                            compute[i, j_o * 32 : j_o * 32 + 32],
+                            [32],
+                            dtype="int32",
+                            offset_factor=1,  # type: ignore
                         )
-                        A_u8x4: T.uint8x4 = A[0:4]  # type: ignore
-                        A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")  # type: ignore
-                        B_i32x32: T.int32x32 = T.reinterpret(B[0, 0:128], dtype="int32x32")  # type: ignore
-                        C[0:32] = T.call_llvm_pure_intrin(  # type: ignore
-                            4390, T.uint32(3), C[0:32], B_i32x32, A_i32, dtype="int32x32"
+                        a_u8x4: T.uint8x4 = a_buffer[0:4]  # type: ignore
+                        a_i32: T.int32 = T.reinterpret(a_u8x4, dtype="int32")  # type: ignore
+                        b_i32x32: T.int32x32 = T.reinterpret(
+                            b_buffer[0, 0:128], dtype="int32x32"
+                        )  # type: ignore
+                        c_buffer[0:32] = T.call_llvm_pure_intrin(  # type: ignore
+                            4390, T.uint32(3), C[0:32], b_i32x32, a_i32, dtype="int32x32"
                         )
 
 
 @tvm.testing.requires_hexagon
 def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
+    """Test VRMPY dense operator."""
     if hexagon_launcher.is_simulator():
         pytest.skip(msg="Tuning on simulator not supported.")
 
-    M, N, K = 128, 768, 768
-    workload = te.create_prim_func(dense(M, N, K))
+    m_size, n_size, k_size = 128, 768, 768
+    workload = te.create_prim_func(dense_compute(m_size, n_size, k_size))
 
     sch_rules = [
         schedule_rule.MultiLevelTilingWithIntrin(
@@ -308,7 +334,8 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
     ]
 
     # Make this to False to compile and run the best tuned schedule
-    if True:
+    run_tuning = True
+    if run_tuning:
         with tempfile.TemporaryDirectory() as work_dir:
             target = get_hexagon_target("v68")
             database = ms.tir_integration.tune_tir(
@@ -328,25 +355,26 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher):
             )
             sch = ms.tir_integration.compile_tir(database, workload, target)
     else:
-        sch = tvm.tir.Schedule(Module_vrmpy_auto_tensorize, debug_mask="all")
+        sch = tvm.tir.Schedule(ModuleVRMPYAutoTensorize, debug_mask="all")
 
     with hexagon_launcher.create_session() as session:
-        verify_dense(sch, get_hexagon_target("v68"), M, N, K, session)
+        verify_dense(sch, get_hexagon_target("v68"), m_size, 
n_size, k_size, session) @tvm.testing.requires_hexagon def test_conv2d_relay_auto_schedule(hexagon_launcher): + """Test conv2d using auto schedule.""" if hexagon_launcher.is_simulator(): pytest.skip(msg="Tuning on simulator not supported.") - I, O, H, W = 64, 64, 56, 56 - kH = kW = 3 + i_size, o_size, h_size, w_size = 64, 64, 56, 56 + k_height_size = k_width_size = 3 strides = (1, 1) padding = (1, 1) - d_shape = (1, H, W, I) - w_shape = (kH, kW, I, O) + d_shape = (1, h_size, w_size, i_size) + w_shape = (k_height_size, k_width_size, i_size, o_size) bias_shape = (1, 1, 1, w_shape[3]) out_channel = w_shape[3] @@ -356,7 +384,7 @@ def test_conv2d_relay_auto_schedule(hexagon_launcher): conv2d = relay.nn.conv2d( data=data, weight=weight, - kernel_size=(kH, kW), + kernel_size=(k_height_size, k_width_size), channels=out_channel, padding=padding, strides=strides, diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx.py b/tests/python/contrib/test_hexagon/test_parallel_hvx.py index 046f949a761f..15273afdd41e 100644 --- a/tests/python/contrib/test_hexagon/test_parallel_hvx.py +++ b/tests/python/contrib/test_hexagon/test_parallel_hvx.py @@ -26,7 +26,11 @@ from .infrastructure import get_hexagon_target -TEST_OUTPUT_TEMPLATE = "Test {} with {} operations... \n -Single Thread: {} ms \n -Parallel: {} ms\n -Speedup: {}x\n" +TEST_OUTPUT_TEMPLATE = ( + "Test {} with {} operations... \n" + " -Single Thread: {} ms \n" + " -Parallel: {} ms\n -Speedup: {}x\n" +) def get_vrmpy_shape_dtypes(operations): @@ -61,28 +65,30 @@ def vrmpy_expected_producer(shape, a, b): expected = np.zeros(shape, dtype="int32") for n in range(shape[0]): for i in range(32): - for r in range(4): - expected[n, i] = expected[n, i] + np.uint32(a[n, i * 4 + r]) * np.uint32( - b[n, i * 4 + r] + for r_ind in range(4): + expected[n, i] = expected[n, i] + np.uint32(a[n, i * 4 + r_ind]) * np.uint32( + b[n, i * 4 + r_ind] ) return expected def get_vmpy_operator(operations): + """Generate vector multiply operator""" + @T.prim_func def operator(a: T.handle, b: T.handle, c: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, [operations, 128], dtype="uint8") - B = T.match_buffer(b, [operations, 128], dtype="uint8") - C = T.match_buffer(c, [operations, 128], dtype="int16") + a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8") + b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8") + c_buffer = T.match_buffer(c, [operations, 128], dtype="int16") for n in T.grid(operations): - with T.block("C"): - vn = T.axis.remap("S", [n]) - C[vn, T.ramp(0, 1, 128)] = T.call_llvm_intrin( + with T.block("c_buffer"): + vn_ind = T.axis.remap("S", [n]) + c_buffer[vn_ind, T.ramp(0, 1, 128)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vmpybusv.128B"), T.uint32(2), - T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), - T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(a_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(b_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), dtype="int16x128", ) @@ -90,20 +96,22 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: def get_vadd_operator(operations): + """Generate vadd operator.""" + @T.prim_func def operator(a: T.handle, b: T.handle, c: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, [operations, 128], dtype="uint8") - B = T.match_buffer(b, [operations, 128], dtype="uint8") - C = T.match_buffer(c, [operations, 
128], dtype="int16") + a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8") + b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8") + c_buffer = T.match_buffer(c, [operations, 128], dtype="int16") for n in T.grid(operations): - with T.block("C"): - vn = T.axis.remap("S", [n]) - C[vn, T.ramp(0, 1, 128)] = T.call_llvm_intrin( + with T.block("c_buffer"): + vn_ind = T.axis.remap("S", [n]) + c_buffer[vn_ind, T.ramp(0, 1, 128)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vaddubh.128B"), T.uint32(2), - T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), - T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(a_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(b_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), dtype="int16x128", ) @@ -111,20 +119,22 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: def get_vrmpy_operator(operations): + """Generate vrmpy operator.""" + @T.prim_func def operator(a: T.handle, b: T.handle, c: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, [operations, 128], dtype="uint8") - B = T.match_buffer(b, [operations, 128], dtype="uint8") - C = T.match_buffer(c, [operations, 32], dtype="int32") + a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8") + b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8") + c_buffer = T.match_buffer(c, [operations, 32], dtype="int32") for n in T.grid(operations): - with T.block("C"): - vn = T.axis.remap("S", [n]) - C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin( + with T.block("c_buffer"): + vn_ind = T.axis.remap("S", [n]) + c_buffer[vn_ind, T.ramp(0, 1, 32)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), T.uint32(2), - T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), - T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(a_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(b_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), dtype="int32x32", ) @@ -132,6 +142,7 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: def evaluate(hexagon_session, shape_dtypes, expected_output_producer, sch): + """Evaluate schedule.""" a_shape, a_dtype, b_shape, b_dtype, c_shape, c_dtype = shape_dtypes func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v68")) @@ -160,6 +171,7 @@ def evaluate(hexagon_session, shape_dtypes, expected_output_producer, sch): class TestMatMulVec: + """MatMul test class.""" ( operation_name, @@ -182,9 +194,11 @@ class TestMatMulVec: 128, # 256, # 512, - # 1024, # Single thread runs faster since L2 cache can handle the entire request quickly + # Single thread runs faster since L2 cache can handle the entire request quickly + # 1024, # 2048, - # 4096, # Significant performance degredation once the inputs and outputs cannot all fit in L2 + # Significant performance degredation once the inputs and outputs cannot all fit in L2 + # 4096, # 8192, # 16384, ) @@ -200,6 +214,7 @@ def test( expected_output_producer, split_factor, ): + """Test function handler.""" sch = tvm.tir.Schedule(operator_producer(operation_count)) single_thread_runtime = evaluate( @@ -207,10 +222,10 @@ def test( ) sch = tvm.tir.Schedule(operator_producer(operation_count)) - block = sch.get_block("C") + block = sch.get_block("c_buffer") b = sch.get_loops(block) - bo, _ = sch.split(b[0], factors=[split_factor, None]) - sch.parallel(bo) + b_output, _ = sch.split(b[0], factors=[split_factor, 
None]) + sch.parallel(b_output) parallel_runtime = evaluate( hexagon_session, shape_dtypes_producer(operation_count), expected_output_producer, sch diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py index 6e43298a4eb5..f720f67ea354 100644 --- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py +++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py @@ -25,14 +25,24 @@ from .infrastructure import get_hexagon_target -TEST_OUTPUT_TEMPLATE = "Test with {} MB of data to load... \n -No VTCM: {} Gops \n -Basic VTCM: {} Gops \n -Vectorized: {} Gops\n -Vectorized and Parallelized: {} Gops\n -Preallocated and Vectorized: {} Gops\n -Preallocated, Vectorized, and Parallelized: {} Gops\n -Single DMA: {} Gops\n -Preloaded: {} Gops\n" +TEST_OUTPUT_TEMPLATE = ( + "Test with {} MB of data to load... \n" + " -No VTCM: {} Gops \n -Basic VTCM: {} Gops \n" + " -Vectorized: {} Gops\n -Vectorized and" + " Parallelized: {} Gops\n -Preallocated and Vectorized: {} Gops\n" + " -Preallocated, Vectorized, and Parallelized: {} Gops\n" + " -Single DMA: {} Gops\n -Preloaded: {} Gops\n" +) def apply_parallel_unroll_vectorize(sch, blocks, outer_split, unroll_split, vector_split): + """Apply parallel unroll vectorized.""" for block in blocks: - vb, vi = sch.get_loops(block) - v = sch.fuse(vb, vi) - vbo, vbi, vio, vii = sch.split(v, factors=[outer_split, None, unroll_split, vector_split]) + vb_index, vi_index = sch.get_loops(block) + v = sch.fuse(vb_index, vi_index) + vbo, vbi, vio, vii = sch.split( # pylint: disable=unused-variable + v, factors=[outer_split, None, unroll_split, vector_split] + ) # pylint: disable=unused-variable sch.vectorize(vii) sch.unroll(vio) sch.parallel(vbo) @@ -41,8 +51,8 @@ def apply_parallel_unroll_vectorize(sch, blocks, outer_split, unroll_split, vect def apply_unroll_vectorize(sch, blocks, unroll_split, vector_split): for block in blocks: - vb, vi = sch.get_loops(block) - v = sch.fuse(vb, vi) + vb_index, vi_index = sch.get_loops(block) + v = sch.fuse(vb_index, vi_index) _, vio, vii = sch.split(v, factors=[None, unroll_split, vector_split]) sch.vectorize(vii) sch.unroll(vio) @@ -50,15 +60,15 @@ def apply_unroll_vectorize(sch, blocks, unroll_split, vector_split): def apply_vrmpy_parallelization(sch): - block = sch.get_block("C") + block = sch.get_block("c_buffer") b = sch.get_loops(block) - bo, _ = sch.split(b[0], factors=[4, None]) - sch.parallel(bo) + b_outer, _ = sch.split(b[0], factors=[4, None]) + sch.parallel(b_outer) return sch def apply_vtcm_cache_read_write(sch): - block = sch.get_block("C") + block = sch.get_block("c_buffer") sch.cache_read(block, 0, "global.vtcm") sch.cache_read(block, 1, "global.vtcm") sch.cache_write(block, 0, "global.vtcm") @@ -66,20 +76,22 @@ def apply_vtcm_cache_read_write(sch): def vrmpy(operations): + """Generate VRMPY operator""" + @T.prim_func def operator(a: T.handle, b: T.handle, c: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128) - B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128) - C = T.match_buffer(c, [operations, 32], dtype="int32", align=128) + a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8", align=128) + b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8", align=128) + c_buffer = T.match_buffer(c, [operations, 32], dtype="int32", align=128) for n in T.grid(operations): - with T.block("C"): - vn 
= T.axis.remap("S", [n]) - C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin( + with T.block("c_buffer"): + vn_ind = T.axis.remap("S", [n]) + c_buffer[vn_ind, T.ramp(0, 1, 32)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), T.uint32(2), - T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), - T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(a_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(b_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), dtype="int32x32", ) @@ -87,34 +99,40 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: def preloaded_vrmpy(operations): + """Generate preloaded VRMPY operator.""" + @T.prim_func def operator(a: T.handle, b: T.handle, c: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer( + a_buffer = T.match_buffer( a, [T.cast(operations, "int32") * 128], dtype="uint8", align=128, mem_scope="global.vtcm", ) - B = T.match_buffer( + b_buffer = T.match_buffer( b, [T.cast(operations, "int32") * 128], dtype="uint8", align=128, mem_scope="global.vtcm", ) - C = T.match_buffer( + c_buffer = T.match_buffer( c, [T.cast(operations, "int32") * 32], dtype="int32", align=128, mem_scope="global.vtcm" ) for n in T.grid(operations): - with T.block("C"): - vn = T.axis.remap("S", [n]) - C[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin( + with T.block("c_buffer"): + vn_ind = T.axis.remap("S", [n]) + c_buffer[T.ramp(T.cast(vn_ind, "int32") * 32, 1, 32)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), T.uint32(2), - T.reinterpret(A[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"), - T.reinterpret(B[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"), + T.reinterpret( + a_buffer[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)], dtype="int32x32" + ), + T.reinterpret( + b_buffer[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)], dtype="int32x32" + ), dtype="int32x32", ) @@ -122,6 +140,7 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: def preallocated_vrmpy(operations): + """Generate preallocated VRMPY operator.""" size = operations * 128 out_size = operations * 32 @@ -130,49 +149,56 @@ def operator( a: T.handle, b: T.handle, c: T.handle, a_v: T.handle, b_v: T.handle, c_v: T.handle ) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, mem_scope="global") - B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, mem_scope="global") - C = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global") - A_global_vtcm = T.match_buffer( + a_buffer = T.match_buffer( + a, [operations, 128], dtype="uint8", align=128, mem_scope="global" + ) + b_buffer = T.match_buffer( + b, [operations, 128], dtype="uint8", align=128, mem_scope="global" + ) + c_buffer = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global") + a_global_vtcm = T.match_buffer( a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" ) - B_global_vtcm = T.match_buffer( + b_global_vtcm = T.match_buffer( b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" ) - C_global_vtcm = T.match_buffer( + c_global_vtcm = T.match_buffer( c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm" ) for n, i in T.grid(operations, 128): with T.block("A_global.vtcm"): - vn, vi = T.axis.remap("SS", [n, i]) - A_global_vtcm[vn * 128 + vi] = A[vn, vi] + vn_ind, 
vi_index = T.axis.remap("SS", [n, i]) + a_global_vtcm[vn_ind * 128 + vi_index] = a_buffer[vn_ind, vi_index] for n, i in T.grid(operations, 128): with T.block("B_global.vtcm"): - vn, vi = T.axis.remap("SS", [n, i]) - B_global_vtcm[vn * 128 + vi] = B[vn, vi] + vn_ind, vi_index = T.axis.remap("SS", [n, i]) + b_global_vtcm[vn_ind * 128 + vi_index] = b_buffer[vn_ind, vi_index] for n in T.grid(operations): - with T.block("C"): - vn = T.axis.remap("S", [n]) - C_global_vtcm[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin( + with T.block("c_buffer"): + vn_ind = T.axis.remap("S", [n]) + c_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 32, 1, 32)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), T.uint32(2), T.reinterpret( - A_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + a_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)], + dtype="int32x32", ), T.reinterpret( - B_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + b_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)], + dtype="int32x32", ), dtype="int32x32", ) for n, i in T.grid(operations, 32): with T.block("C_global.vtcm"): - vn, vi = T.axis.remap("SS", [n, i]) - C[vn, vi] = C_global_vtcm[vn * 32 + vi] + vn_ind, vi_index = T.axis.remap("SS", [n, i]) + c_buffer[vn_ind, vi_index] = c_global_vtcm[vn_ind * 32 + vi_index] return operator def preallocated_single_dma_vrmpy(operations): + """Generate preallocated single DMA VRMPY operator.""" size = operations * 128 out_size = operations * 32 @@ -186,36 +212,40 @@ def operator( c_v: T.handle, ) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, mem_scope="global") - B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, mem_scope="global") - C = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global") - A_global_vtcm = T.match_buffer( + a_buffer = T.match_buffer( + a, [operations, 128], dtype="uint8", align=128, mem_scope="global" + ) + b_buffer = T.match_buffer( + b, [operations, 128], dtype="uint8", align=128, mem_scope="global" + ) + c_buffer = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global") + a_global_vtcm = T.match_buffer( a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" ) - B_global_vtcm = T.match_buffer( + b_global_vtcm = T.match_buffer( b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" ) - C_global_vtcm = T.match_buffer( + c_global_vtcm = T.match_buffer( c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm" ) T.evaluate( T.tvm_call_packed( "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( - A_global_vtcm.data, + a_global_vtcm.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - A_global_vtcm.dtype, + a_global_vtcm.dtype, 0, dtype="handle", ), T.tvm_stack_make_array( - A.data, + a_buffer.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - A.dtype, + a_buffer.dtype, 0, dtype="handle", ), @@ -227,20 +257,20 @@ def operator( T.tvm_call_packed( "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( - B_global_vtcm.data, + b_global_vtcm.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - B_global_vtcm.dtype, + b_global_vtcm.dtype, 0, dtype="handle", ), T.tvm_stack_make_array( - B.data, + b_buffer.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - B.dtype, + b_buffer.dtype, 0, dtype="handle", ), @@ -249,16 +279,18 @@ def operator( ) ) for n in 
T.grid(operations): - with T.block("C"): - vn = T.axis.remap("S", [n]) - C_global_vtcm[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin( + with T.block("c_buffer"): + vn_ind = T.axis.remap("S", [n]) + c_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 32, 1, 32)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), T.uint32(2), T.reinterpret( - A_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + a_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)], + dtype="int32x32", ), T.reinterpret( - B_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + b_global_vtcm[T.ramp(T.cast(vn_ind, "int32") * 128, 1, 128)], + dtype="int32x32", ), dtype="int32x32", ) @@ -266,20 +298,20 @@ def operator( T.tvm_call_packed( "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( - C.data, + c_buffer.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - C.dtype, + c_buffer.dtype, 0, dtype="handle", ), T.tvm_stack_make_array( - C_global_vtcm.data, + c_global_vtcm.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - C_global_vtcm.dtype, + c_global_vtcm.dtype, 0, dtype="handle", ), @@ -301,6 +333,7 @@ def evaluate_result(operations, tag, time, result, expected_output): def setup_and_run(hexagon_session, sch, a, b, c, operations, mem_scope="global"): + """Setup and run operator.""" func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v69")) module = hexagon_session.load_module(func_tir) @@ -321,6 +354,7 @@ def setup_and_run(hexagon_session, sch, a, b, c, operations, mem_scope="global") def setup_and_run_preallocated(hexagon_session, sch, a, b, c, operations): + """Setup and run for preallocated.""" func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v69")) module = hexagon_session.load_module(func_tir) @@ -353,34 +387,9 @@ def setup_and_run_preallocated(hexagon_session, sch, a, b, c, operations): return gops, c_hexagon.asnumpy() -@tvm.testing.fixture -def input_a(operations): - return default_rng().integers(0, 16, (operations, 128), dtype="uint8") - - -@tvm.testing.fixture -def input_b(operations): - return default_rng().integers(0, 16, (operations, 128), dtype="uint8") - - -@tvm.testing.fixture -def input_c(operations): - return np.zeros((operations, 32), dtype="int32") - - -@tvm.testing.fixture -def expected_output(operations, input_a, input_b, input_c): - expected_output = np.zeros(input_c.shape, dtype="int32") - for n in range(operations): - for i in range(32): - for r in range(4): - expected_output[n, i] = expected_output[n, i] + np.uint32( - input_a[n, i * 4 + r] - ) * np.uint32(input_b[n, i * 4 + r]) - return expected_output - - class TestMatMulVec: + """MatMul test class.""" + # Removed most of these to speedup CI. 
operations = tvm.testing.parameter( 1024, @@ -398,6 +407,29 @@ class TestMatMulVec: c_vector_split = tvm.testing.parameter(16) c_vector_split_unallocated = tvm.testing.parameter(8) + @tvm.testing.fixture + def input_a(self, operations): + return default_rng().integers(0, 16, (operations, 128), dtype="uint8") + + @tvm.testing.fixture + def input_b(self, operations): + return default_rng().integers(0, 16, (operations, 128), dtype="uint8") + + @tvm.testing.fixture + def input_c(self, operations): + return np.zeros((operations, 32), dtype="int32") + + @tvm.testing.fixture + def expected_output(self, operations, input_a, input_b, input_c): + expected_output = np.zeros(input_c.shape, dtype="int32") + for n in range(operations): + for i in range(32): + for r_ind in range(4): # pylint: disable=unused-variable + expected_output[n, i] = expected_output[n, i] + np.uint32( + input_a[n, i * 4 + r_ind] + ) * np.uint32(input_b[n, i * 4 + r_ind]) + return expected_output + @tvm.testing.requires_hexagon def test_loading_vtcm_for_vrmpy( self, @@ -413,7 +445,7 @@ def test_loading_vtcm_for_vrmpy( c_vector_split, c_vector_split_unallocated, ): - + """Load VTCM for VRMPY operator test.""" # Run parallel vrmpy without loading to VTCM. sch = tvm.tir.Schedule(vrmpy(operations)) sch = apply_vrmpy_parallelization(sch) @@ -503,7 +535,7 @@ def test_loading_vtcm_for_vrmpy( sch = apply_parallel_unroll_vectorize( sch, [sch.get_block("C_global.vtcm")], outer_split, unroll_split, c_vector_split ) - preallocated_vectorized_parallelized_runtime, result = setup_and_run_preallocated( + prealloc_vector_parallelized, result = setup_and_run_preallocated( hexagon_session, sch, input_a, input_b, input_c, operations ) result = result.reshape((operations, 32)) @@ -539,7 +571,7 @@ def test_loading_vtcm_for_vrmpy( vectorized_runtime, vectorized_parallelized_runtime, preallocated_vectorized_runtime, - preallocated_vectorized_parallelized_runtime, + prealloc_vector_parallelized, single_dma_runtime, preloaded_runtime, ) diff --git a/tests/python/contrib/test_hexagon/test_parallel_scalar.py b/tests/python/contrib/test_hexagon/test_parallel_scalar.py index fd3eef1b195b..b96265d9df99 100644 --- a/tests/python/contrib/test_hexagon/test_parallel_scalar.py +++ b/tests/python/contrib/test_hexagon/test_parallel_scalar.py @@ -25,55 +25,66 @@ from .infrastructure import get_hexagon_target -TEST_OUTPUT_TEMPLATE = "Test {} with {} operations... \n -Single Thread: {} ms \n -Parallel: {} ms\n -Speedup: {}x\n" +TEST_OUTPUT_TEMPLATE = ( + "Test {} with {} operations... 
\n"
+    " -Single Thread: {} ms \n"
+    " -Parallel: {} ms\n -Speedup: {}x\n"
+)
 
 
 def get_add_operator(operations):
+    """Generate add operator."""
+
     @T.prim_func
     def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, [operations], dtype="float64")
-        B = T.match_buffer(b, [operations], dtype="float64")
-        C = T.match_buffer(c, [operations], dtype="float64")
+        a_buffer = T.match_buffer(a, [operations], dtype="float64")
+        b_buffer = T.match_buffer(b, [operations], dtype="float64")
+        c_buffer = T.match_buffer(c, [operations], dtype="float64")
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C[vn] = A[vn] + B[vn]
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_buffer[vn_ind] = a_buffer[vn_ind] + b_buffer[vn_ind]
 
     return operator
 
 
 def get_multiply_operator(operations):
+    """Generate multiply operator."""
+
     @T.prim_func
     def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, [operations], dtype="float64")
-        B = T.match_buffer(b, [operations], dtype="float64")
-        C = T.match_buffer(c, [operations], dtype="float64")
+        a_buffer = T.match_buffer(a, [operations], dtype="float64")
+        b_buffer = T.match_buffer(b, [operations], dtype="float64")
+        c_buffer = T.match_buffer(c, [operations], dtype="float64")
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C[vn] = A[vn] * B[vn]
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_buffer[vn_ind] = a_buffer[vn_ind] * b_buffer[vn_ind]
 
     return operator
 
 
 def get_sub_operator(operations):
+    """Generate subtract operator."""
+
     @T.prim_func
     def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
         T.func_attr({"global_symbol": "main", "tir.noalias": True})
-        A = T.match_buffer(a, [operations], dtype="float64")
-        B = T.match_buffer(b, [operations], dtype="float64")
-        C = T.match_buffer(c, [operations], dtype="float64")
+        a_buffer = T.match_buffer(a, [operations], dtype="float64")
+        b_buffer = T.match_buffer(b, [operations], dtype="float64")
+        c_buffer = T.match_buffer(c, [operations], dtype="float64")
         for n in T.grid(operations):
-            with T.block("C"):
-                vn = T.axis.remap("S", [n])
-                C[vn] = A[vn] - B[vn]
+            with T.block("c_buffer"):
+                vn_ind = T.axis.remap("S", [n])
+                c_buffer[vn_ind] = a_buffer[vn_ind] - b_buffer[vn_ind]
 
     return operator
 
 
 def evaluate(hexagon_session, operations, expected, sch):
+    """Evaluate schedule."""
     shape = operations
     dtype = "float64"
 
@@ -104,6 +115,7 @@ def evaluate(hexagon_session, operations, expected, sch):
 
 
 class TestMatMulVec:
+    """MatMul test class."""
 
     (operation_name, operator_producer, expected_output_producer,) = tvm.testing.parameters(
         ("add", get_add_operator, (lambda a, b: a + b)),
@@ -116,9 +128,11 @@ class TestMatMulVec:
         128,
         #    256,
         #    512,
-        #    1024,  # Single thread runs faster since L2 cache can handle the entire request quickly
+        # Single thread runs faster since L2 cache can handle the entire request quickly
+        #    1024,
         #    2048,
-        #    4096,  # Significant performance degredation once the inputs and outputs cannot all fit in L2
+        # Significant performance degradation once the inputs and outputs cannot all fit in L2
+        #    4096,
         #    8192,
         #    16384,
     )
@@ -135,15 +149,16 @@ def test_add(
         operations,
         split_factor,
     ):
+        """Test Add operator."""
        sch = tvm.tir.Schedule(operator_producer(operations))
 
        single_thread_runtime = evaluate(hexagon_session, operations, expected_output_producer, 
sch) sch = tvm.tir.Schedule(operator_producer(operations)) - block = sch.get_block("C") + block = sch.get_block("c_buffer") b = sch.get_loops(block) - bo, _ = sch.split(b[0], factors=[split_factor, None]) - sch.parallel(bo) + b_output, _ = sch.split(b[0], factors=[split_factor, None]) + sch.parallel(b_output) parallel_runtime = evaluate(hexagon_session, operations, expected_output_producer, sch) speedup = round(single_thread_runtime / parallel_runtime, 2) diff --git a/tests/python/contrib/test_hexagon/test_sigmoid.py b/tests/python/contrib/test_hexagon/test_sigmoid.py index 1ff5bf3db340..e115b188a3f0 100644 --- a/tests/python/contrib/test_hexagon/test_sigmoid.py +++ b/tests/python/contrib/test_hexagon/test_sigmoid.py @@ -14,26 +14,25 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Sigmoid operator tests.""" import numpy as np -import pytest import tvm import tvm.testing from tvm import te from tvm import tir from tvm import topi -from tvm.contrib.hexagon.build import HexagonLauncher from .infrastructure import allocate_hexagon_array, get_hexagon_target -def sigmoid_compute(Input): - return topi.sigmoid(Input) +def sigmoid_compute(sigmoid_input): + return topi.sigmoid(sigmoid_input) -def sigmoid_stir_schedule(Input, Output): - sigmoid_func = te.create_prim_func([Input, Output]) +def sigmoid_stir_schedule(sigmoid_input, sigmoid_output): + sigmoid_func = te.create_prim_func([sigmoid_input, sigmoid_output]) sch = tir.Schedule(sigmoid_func, debug_mask="all") block = sch.get_block("compute") @@ -42,17 +41,6 @@ def sigmoid_stir_schedule(Input, Output): return sch -@tvm.testing.fixture -def input_np(in_shape, dtype, min_val, max_val): - return np.random.uniform(low=min_val, high=max_val, size=in_shape).astype(dtype) - - -@tvm.testing.fixture -def ref_output_np(input_np): - output_np = 1 / (1 + np.exp(-input_np)) - return output_np - - class BaseSigmoid: (in_shape, dtype, min_val, max_val,) = tvm.testing.parameters( ((64,), "float16", -8.0, 8.0), @@ -64,6 +52,17 @@ class BaseSigmoid: class TestSigmoid(BaseSigmoid): + """Sigmoid test class.""" + + @tvm.testing.fixture + def input_np(self, in_shape, dtype, min_val, max_val): + return np.random.uniform(low=min_val, high=max_val, size=in_shape).astype(dtype) + + @tvm.testing.fixture + def ref_output_np(self, input_np): + output_np = 1 / (1 + np.exp(-input_np)) + return output_np + @tvm.testing.requires_hexagon def test_sigmoid( self, @@ -73,11 +72,12 @@ def test_sigmoid( ref_output_np, hexagon_session, ): - InputTensor = te.placeholder(in_shape, name="InputTensor", dtype=dtype) + """Sigmoid test.""" + input_tensor = te.placeholder(in_shape, name="input_tensor", dtype=dtype) - OutputTensor = sigmoid_compute(InputTensor) + output_tensor = sigmoid_compute(input_tensor) - tir_s = sigmoid_stir_schedule(InputTensor, OutputTensor) + tir_s = sigmoid_stir_schedule(input_tensor, output_tensor) input_data = allocate_hexagon_array( hexagon_session.device, diff --git a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py index f80a579f58fe..ba7513a4f39c 100644 --- a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py +++ b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py @@ -14,9 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+"""Async software pipeline tests."""
 
-import sys
-import pytest
 import numpy as np
 
 import tvm
@@ -25,174 +24,178 @@
 
 from .infrastructure import get_hexagon_target
 
-outer = tvm.testing.parameter(8, 16)
-inner = tvm.testing.parameter(64, 128)
-dtype = tvm.testing.parameter("uint8", "float16")
-scope = tvm.testing.parameter("global", "global.vtcm")
-# TODO(Joseph) Turn on "multi_input_diffQ" compute type once we have upstreamed
-# changes in the InjectSoftwarePipeline pass to alleviate this restriction:
-# 'A dependency on multiple async stages is not supported'
-comp_type = tvm.testing.parameter("single_input", "multi_input_sameQ")
-# TODO(Straw) Add back "cache_write" schedule type once we have upstreamed
-# buffer dependency analysis in InjectSoftwarePipeline pass
-# to insert approprite TIR "wait" attributes for this schedule
-sched_type = tvm.testing.parameter("cache_read", "cache_read_write")
-
-
-@tvm.testing.fixture
-def data(comp_type, outer, inner, dtype):
-    out_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
-    a_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
-    if comp_type == "single_input":
-        return out_np, a_np
-    else:
-        b_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype)
-        return out_np, a_np, b_np
-
-@tvm.testing.fixture
 def compute(comp_type, outer, inner, dtype):
+    """Generate compute function."""
     if comp_type == "single_input":
 
         @T.prim_func
         def a_plus_1_primfunc(
-            A: T.Buffer[(outer, inner), dtype], OUT: T.Buffer[(outer, inner), dtype]
+            a_buffer: T.Buffer[(outer, inner), dtype], out: T.Buffer[(outer, inner), dtype]
         ):
             for i in T.serial(outer):
                 for j in T.serial(inner):
                     with T.block("compute"):
                         with T.block():
-                            OUT[i, j] = A[i, j] + T.cast(1, dtype)
+                            out[i, j] = a_buffer[i, j] + T.cast(1, dtype)
 
         return a_plus_1_primfunc
     else:
 
         @T.prim_func
         def a_plus_b_plus_1_primfunc(
-            A: T.Buffer[(outer, inner), dtype],
-            B: T.Buffer[(outer, inner), dtype],
-            OUT: T.Buffer[(outer, inner), dtype],
+            a_buffer: T.Buffer[(outer, inner), dtype],
+            b_buffer: T.Buffer[(outer, inner), dtype],
+            out: T.Buffer[(outer, inner), dtype],
         ):
             for i in T.serial(outer):
                 for j in T.serial(inner):
                     with T.block("compute"):
                         with T.block():
-                            OUT[i, j] = A[i, j] + B[i, j] + T.cast(1, dtype)
+                            out[i, j] = a_buffer[i, j] + b_buffer[i, j] + T.cast(1, dtype)
 
         return a_plus_b_plus_1_primfunc
 
 
-@tvm.testing.fixture
-def reference(comp_type):
-    if comp_type == "single_input":
-
-        def a_plus_1_ref(a):
-            return a + 1
-
-        return a_plus_1_ref
-    else:
-
-        def a_plus_b_plus_1_ref(a, b):
-            return a + b + 1
-
-        return a_plus_b_plus_1_ref
-
-
-@tvm.testing.fixture
-def schedule(comp_type, compute, sched_type, scope):
-    sch = tir.Schedule(compute)
-
-    compute_block = sch.get_block("compute")
-    i, _ = sch.get_loops(compute_block)
-
-    if "read" in sched_type:
-        cache_read_a = sch.cache_read(compute_block, 0, scope)
-        sch.compute_at(cache_read_a, i)
+class TestAsyncSoftwarePipeline:
+    """Async software pipeline test class."""
+
+    outer = tvm.testing.parameter(8, 16)
+    inner = tvm.testing.parameter(64, 128)
+    dtype = tvm.testing.parameter("uint8", "float16")
+    scope = tvm.testing.parameter("global", "global.vtcm")
+    # TODO(Joseph) Turn on "multi_input_diffQ" compute type once we have upstreamed
+    # changes in the InjectSoftwarePipeline pass to alleviate this restriction:
+    # 'A dependency on multiple async stages is not supported'
+    comp_type = tvm.testing.parameter("single_input", "multi_input_sameQ")
+    # TODO(Straw) Add back "cache_write" 
schedule type once we have upstreamed + # buffer dependency analysis in InjectSoftwarePipeline pass + # to insert approprite TIR "wait" attributes for this schedule + sched_type = tvm.testing.parameter("cache_read", "cache_read_write") + + @tvm.testing.fixture + def data(self, comp_type, outer, inner, dtype): + out_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype) + a_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype) + if comp_type == "single_input": + return out_np, a_np + else: + b_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype) + return out_np, a_np, b_np + + @tvm.testing.fixture + def verify(self, dtype): + def check(out, ref): + if "int" in dtype: + np.testing.assert_equal(out.numpy(), ref) + else: + np.testing.assert_allclose(out.numpy(), ref, rtol=1e-3, atol=1e-3) + + return check + + @tvm.testing.fixture + def reference(self, comp_type): + """Returns reference data.""" + if comp_type == "single_input": - if "multi_input" in comp_type: - cache_read_b = sch.cache_read(compute_block, 1, scope) - sch.compute_at(cache_read_b, i) + def a_plus_1_ref(a): + return a + 1 - if "write" in sched_type: - cache_write_out = sch.cache_write(compute_block, 0, scope) - sch.reverse_compute_at(cache_write_out, i) + return a_plus_1_ref + else: - if "read" in sched_type and "write" in sched_type: - if comp_type == "single_input": - sch.annotate(i, "software_pipeline_stage", [0, 1, 2]) - sch.annotate(i, "software_pipeline_order", [0, 1, 2]) - sch.annotate(i, "software_pipeline_async_stages", [0, 2]) - elif comp_type == "multi_input_sameQ": - sch.annotate(i, "software_pipeline_stage", [0, 0, 1, 2]) - sch.annotate(i, "software_pipeline_order", [0, 1, 2, 3]) - sch.annotate(i, "software_pipeline_async_stages", [0, 2]) - elif comp_type == "multi_input_diffQ": - sch.annotate(i, "software_pipeline_stage", [0, 1, 2, 3]) - sch.annotate(i, "software_pipeline_order", [0, 1, 2, 3]) - sch.annotate(i, "software_pipeline_async_stages", [0, 1, 2]) - - elif "read" in sched_type: - if comp_type == "single_input": + def a_plus_b_plus_1_ref(a, b): + return a + b + 1 + + return a_plus_b_plus_1_ref + + @tvm.testing.fixture + def schedule(self, comp_type, sched_type, outer, inner, dtype, scope): + """Generate schedule.""" + sch = tir.Schedule(compute(comp_type, outer, inner, dtype)) + + compute_block = sch.get_block("compute") + i, _ = sch.get_loops(compute_block) + + if "read" in sched_type: + cache_read_a = sch.cache_read(compute_block, 0, scope) + sch.compute_at(cache_read_a, i) + + if "multi_input" in comp_type: + cache_read_b = sch.cache_read(compute_block, 1, scope) + sch.compute_at(cache_read_b, i) + + if "write" in sched_type: + cache_write_out = sch.cache_write(compute_block, 0, scope) + sch.reverse_compute_at(cache_write_out, i) + + if "read" in sched_type and "write" in sched_type: + if comp_type == "single_input": + sch.annotate(i, "software_pipeline_stage", [0, 1, 2]) + sch.annotate(i, "software_pipeline_order", [0, 1, 2]) + sch.annotate(i, "software_pipeline_async_stages", [0, 2]) + elif comp_type == "multi_input_sameQ": + sch.annotate(i, "software_pipeline_stage", [0, 0, 1, 2]) + sch.annotate(i, "software_pipeline_order", [0, 1, 2, 3]) + sch.annotate(i, "software_pipeline_async_stages", [0, 2]) + elif comp_type == "multi_input_diffQ": + sch.annotate(i, "software_pipeline_stage", [0, 1, 2, 3]) + sch.annotate(i, "software_pipeline_order", [0, 1, 2, 3]) + sch.annotate(i, "software_pipeline_async_stages", [0, 1, 2]) + + elif "read" in 
sched_type: + if comp_type == "single_input": + sch.annotate(i, "software_pipeline_stage", [0, 1]) + sch.annotate(i, "software_pipeline_order", [0, 1]) + sch.annotate(i, "software_pipeline_async_stages", [0]) + elif comp_type == "multi_input_sameQ": + sch.annotate(i, "software_pipeline_stage", [0, 0, 1]) + sch.annotate(i, "software_pipeline_order", [0, 1, 2]) + sch.annotate(i, "software_pipeline_async_stages", [0]) + elif comp_type == "multi_input_diffQ": + sch.annotate(i, "software_pipeline_stage", [0, 1, 2]) + sch.annotate(i, "software_pipeline_order", [0, 1, 2]) + sch.annotate(i, "software_pipeline_async_stages", [0, 1]) + + elif "write" in sched_type: sch.annotate(i, "software_pipeline_stage", [0, 1]) sch.annotate(i, "software_pipeline_order", [0, 1]) - sch.annotate(i, "software_pipeline_async_stages", [0]) - elif comp_type == "multi_input_sameQ": - sch.annotate(i, "software_pipeline_stage", [0, 0, 1]) - sch.annotate(i, "software_pipeline_order", [0, 1, 2]) - sch.annotate(i, "software_pipeline_async_stages", [0]) - elif comp_type == "multi_input_diffQ": - sch.annotate(i, "software_pipeline_stage", [0, 1, 2]) - sch.annotate(i, "software_pipeline_order", [0, 1, 2]) - sch.annotate(i, "software_pipeline_async_stages", [0, 1]) - - elif "write" in sched_type: - sch.annotate(i, "software_pipeline_stage", [0, 1]) - sch.annotate(i, "software_pipeline_order", [0, 1]) - sch.annotate(i, "software_pipeline_async_stages", [1]) - - return sch - - -@tvm.testing.fixture -def verify(dtype): - def check(out, ref): - if "int" in dtype: - np.testing.assert_equal(out.numpy(), ref) - else: - np.testing.assert_allclose(out.numpy(), ref, rtol=1e-3, atol=1e-3) - - return check + sch.annotate(i, "software_pipeline_async_stages", [1]) + return sch -@tvm.testing.requires_hexagon -def test_async_software_pipeline(hexagon_launcher, comp_type, data, reference, schedule, verify): - out_np = data[0] - a_np = data[1] - if comp_type == "single_input": - ref = reference(a_np) - else: - b_np = data[2] - ref = reference(a_np, b_np) - - with tvm.transform.PassContext( - config={"tir.use_async_copy": 1, "tir.merge_async_commit_queue_scope": False} + @tvm.testing.requires_hexagon + def test_async_software_pipeline( + self, hexagon_launcher, comp_type, data, reference, schedule, verify ): - # tvm.lower(schedule.mod["main"]).show() - func = tvm.build(schedule.mod["main"], target=get_hexagon_target("v68")) - - with hexagon_launcher.create_session() as hexagon_session: - dev = hexagon_session.device - mod = hexagon_session.load_module(func) - out = tvm.nd.array(out_np, device=dev) - a = tvm.nd.array(a_np, device=dev) + """Async software pipeline test.""" + out_np = data[0] + a_np = data[1] if comp_type == "single_input": - mod(a, out) + ref = reference(a_np) else: - b = tvm.nd.array(b_np, device=dev) - mod(a, b, out) + b_np = data[2] + ref = reference(a_np, b_np) + + with tvm.transform.PassContext( + config={"tir.use_async_copy": 1, "tir.merge_async_commit_queue_scope": False} + ): + # tvm.lower(schedule.mod["main"]).show() + func = tvm.build(schedule.mod["main"], target=get_hexagon_target("v68")) + + with hexagon_launcher.create_session() as hexagon_session: + dev = hexagon_session.device + mod = hexagon_session.load_module(func) + out = tvm.nd.array(out_np, device=dev) + a = tvm.nd.array(a_np, device=dev) + if comp_type == "single_input": + mod(a, out) + else: + b = tvm.nd.array(b_np, device=dev) + mod(a, b, out) - verify(out, ref) + verify(out, ref) if __name__ == "__main__": - sys.exit(pytest.main(sys.argv)) + 
tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py index 307d3a96bf15..980ac0cf4c2a 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -27,47 +27,56 @@ MB = 1024**2 KB = 1024 -TEST_OUTPUT_TEMPLATE = "Test bandwidth with buffer size {}MB... \n -Base: {} GBps \n -Vectorized: {} GBps\n -Vectorized and Parallelized: {} GBps\n -Single DMA Copy: {} GBps\n" +TEST_OUTPUT_TEMPLATE = ( + "Test bandwidth with buffer size {}MB... \n" + " -Base: {} GBps \n -Vectorized: {} GBps\n" + " -Vectorized and Parallelized: {} GBps\n" + " -Single DMA Copy: {} GBps\n" +) def memcopy_operator(size): + """Generate memory copy operator.""" + @T.prim_func def operator(a: T.handle, a_v: T.handle) -> None: - A = T.match_buffer(a, size, dtype="int8", align=128, scope="global") - A_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm") + a_buffer = T.match_buffer(a, size, dtype="int8", align=128, scope="global") + a_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm") for ax0 in T.serial(size): with T.block("A_global.vtcm"): - v0 = T.axis.spatial(size, ax0) - T.reads(A[v0]) - T.writes(A_global_vtcm[v0]) - A_global_vtcm[v0] = A[v0] + v0_ind = T.axis.spatial(size, ax0) + T.reads(a_buffer[v0_ind]) + T.writes(a_global_vtcm[v0_ind]) + a_global_vtcm[v0_ind] = a_buffer[v0_ind] return operator def single_dma_operator(size): + """Generate single dma operator.""" + @T.prim_func def operator(a: T.handle, a_v: T.handle) -> None: - A = T.match_buffer(a, size, dtype="int8", align=128, scope="global") - A_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm") + a_buffer = T.match_buffer(a, size, dtype="int8", align=128, scope="global") + a_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm") T.evaluate( T.tvm_call_packed( "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( - A_global_vtcm.data, + a_global_vtcm.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - A_global_vtcm.dtype, + a_global_vtcm.dtype, 0, dtype="handle", ), T.tvm_stack_make_array( - A.data, + a_buffer.data, T.tvm_stack_make_shape(size, dtype="handle"), 0, 1, - A.dtype, + a_buffer.dtype, 0, dtype="handle", ), @@ -80,6 +89,7 @@ def operator(a: T.handle, a_v: T.handle) -> None: def evaluate(hexagon_session, sch, size): + """Evaluate schedule.""" a_shape = size func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v69")) @@ -110,6 +120,7 @@ def evaluate(hexagon_session, sch, size): class TestMatMulVec: + """MatMul test class.""" # Removed most of these to speedup CI. size = tvm.testing.parameter( @@ -133,7 +144,7 @@ class TestMatMulVec: @tvm.testing.requires_hexagon def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vector_split): - + """Test bandwidth.""" # Run the base memcopy operator. sch = tvm.tir.Schedule(memcopy_operator(size)) base_gpbs = evaluate(hexagon_session, sch, size) @@ -141,8 +152,8 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto # Run with some basic unroll and vectorize scheduling. 
sch = tvm.tir.Schedule(memcopy_operator(size)) vtcm_block_a = sch.get_block("A_global.vtcm") - vb = sch.get_loops(vtcm_block_a) - vbi_a, vio_a, vii_a = sch.split(vb[0], factors=[None, unroll_split, vector_split]) + v_block = sch.get_loops(vtcm_block_a) + _, vio_a, vii_a = sch.split(v_block[0], factors=[None, unroll_split, vector_split]) sch.unroll(vio_a) sch.vectorize(vii_a) vectorize_gbps = evaluate(hexagon_session, sch, size) @@ -150,9 +161,9 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto # Run with some basic unroll and vectorize scheduling and parallelization. sch = tvm.tir.Schedule(memcopy_operator(size)) vtcm_block_a = sch.get_block("A_global.vtcm") - vb = sch.get_loops(vtcm_block_a) - vbo_a, vbi_a, vio_a, vii_a = sch.split( - vb[0], factors=[outer_split, None, unroll_split, vector_split] + v_block = sch.get_loops(vtcm_block_a) + vbo_a, _, vio_a, vii_a = sch.split( + v_block[0], factors=[outer_split, None, unroll_split, vector_split] ) sch.unroll(vio_a) sch.vectorize(vii_a) @@ -169,3 +180,7 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto mbs, base_gpbs, vectorize_gbps, parallel_gbps, single_dma_gbps ) ) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py index 2fc607c0c521..e4edf2919a00 100644 --- a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py +++ b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py @@ -14,8 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""No QNN canonicalization tests.""" -import pytest import numpy as np import tvm.testing @@ -27,6 +27,7 @@ @tvm.testing.requires_hexagon def test_no_qnn_pass(): + """No QNN pass test.""" x = relay.var("x", shape=(4, 8), dtype="float32") op0 = relay.qnn.op.quantize(x, relay.const(2.0), relay.const(10), out_dtype="uint8") op1 = relay.qnn.op.dequantize(op0, relay.const(0.5), relay.const(5)) @@ -61,6 +62,7 @@ def execute(executor, data_np, weight_np, bias_np=None): @tvm.testing.requires_hexagon def test_qnn_conv2d_rq(hexagon_session: Session): + """QNN conv2d test.""" data_shape = [1, 8, 32, 32] weight_shape = [16, 8, 3, 3] data = relay.var("data", shape=data_shape, dtype="float32") @@ -119,6 +121,7 @@ def test_qnn_conv2d_rq(hexagon_session: Session): @tvm.testing.requires_hexagon def test_qnn_dense_bias_rq(hexagon_session: Session): + """QNN dense with bias test.""" data_shape = [8, 8] weight_shape = [16, 8] bias_shape = [16] From fe11aec3990d4f314f5a56c1ee7cd7e02dce0761 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Fri, 28 Oct 2022 22:46:01 +0000 Subject: [PATCH 2/3] fix error --- .../test_hexagon/test_async_dma_pipeline.py | 4 ++++ .../contrib/test_hexagon/test_cache_read_write.py | 4 ++++ .../contrib/test_hexagon/test_meta_schedule.py | 15 +++++++++------ .../test_hexagon/test_parallel_hvx_load_vtcm.py | 4 ++++ .../contrib/test_hexagon/test_run_unit_tests.py | 4 ++++ 5 files changed, 25 insertions(+), 6 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index c5830387160b..a7a05c2aa3a7 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -823,3 +823,7 @@ def test_meta(hexagon_session): 
"pipeline_runtime": pipeline_runtime, }, ) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_cache_read_write.py b/tests/python/contrib/test_hexagon/test_cache_read_write.py index af5e7a398870..3ac297fd80d8 100644 --- a/tests/python/contrib/test_hexagon/test_cache_read_write.py +++ b/tests/python/contrib/test_hexagon/test_cache_read_write.py @@ -220,3 +220,7 @@ def _visit(stmt): "AllocateNode found in lowered IRModule, " "VTCM allocations should have been lowered to tir.nd_mem_alloc_with_scope" ) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py index 9f4fd0768c50..a83a3b279a7f 100644 --- a/tests/python/contrib/test_hexagon/test_meta_schedule.py +++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py @@ -46,11 +46,10 @@ class MatmulModule: """Matmultest class""" + # pylint: disable=no-self-argument @T.prim_func - def main( # type: ignore # pylint: disable=no-self-argument - a: T.handle, b: T.handle, c: T.handle - ) -> None: - """main function""" + def main(a: T.handle, b: T.handle, c: T.handle) -> None: # type: ignore + # pylint: disable=missing-function-docstring T.func_attr({"global_symbol": "main", "tir.noalias": True}) a_buffer = T.match_buffer(a, (16, 16), "float32") b_buffer = T.match_buffer(b, (16, 16), "float32") @@ -243,7 +242,7 @@ def main( # type: ignore packed_width: T.Buffer[(24, 192, 32, 4), "uint8"], # type: ignore compute: T.Buffer[(128, 768), "int32"], # type: ignore ) -> None: - """Main function.""" + # pylint: disable=missing-function-docstring T.func_attr({"global_symbol": "main", "tir.noalias": True}) for i0_0_i1_0_0_fused in T.parallel( 512, annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1} @@ -292,7 +291,7 @@ def main( # type: ignore b_buffer[0, 0:128], dtype="int32x32" ) # type: ignore c_buffer[0:32] = T.call_llvm_pure_intrin( # type: ignore - 4390, T.uint32(3), C[0:32], b_i32x32, a_i32, dtype="int32x32" + 4390, T.uint32(3), c_buffer[0:32], b_i32x32, a_i32, dtype="int32x32" ) @@ -495,3 +494,7 @@ def test_dense_relay_auto_schedule(hexagon_launcher): # Fairly loose check since fp16 results between x86 and Hexagon have # non-trivial difference. 
     assert np.mean(np.abs(ref - out)) < 0.1
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
index f720f67ea354..ebcea50e0bae 100644
--- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
@@ -576,3 +576,7 @@ def test_loading_vtcm_for_vrmpy(
             preloaded_runtime,
         )
     )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_run_unit_tests.py b/tests/python/contrib/test_hexagon/test_run_unit_tests.py
index 24c9f33a8ecb..cd4e5c9b0d66 100644
--- a/tests/python/contrib/test_hexagon/test_run_unit_tests.py
+++ b/tests/python/contrib/test_hexagon/test_run_unit_tests.py
@@ -48,3 +48,7 @@ def test_run_unit_tests(hexagon_session: Session, gtest_args):
         raise RuntimeError(
             f"Hexagon gtest retruned non-zero error code = {gtest_error_code}:\n{gtest_output}"
         )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From f24020e6be2fe7989d58e656e8c95b399f989b13 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar
Date: Wed, 2 Nov 2022 07:17:25 -0700
Subject: [PATCH 3/3] Fix buffer name

---
 .../test_parallel_hvx_load_vtcm.py | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
index ebcea50e0bae..fb398f43977a 100644
--- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
@@ -166,11 +166,11 @@ def operator(
         c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm"
     )
     for n, i in T.grid(operations, 128):
-        with T.block("A_global.vtcm"):
+        with T.block("a_buffer_global.vtcm"):
             vn_ind, vi_index = T.axis.remap("SS", [n, i])
             a_global_vtcm[vn_ind * 128 + vi_index] = a_buffer[vn_ind, vi_index]
     for n, i in T.grid(operations, 128):
-        with T.block("B_global.vtcm"):
+        with T.block("b_buffer_global.vtcm"):
             vn_ind, vi_index = T.axis.remap("SS", [n, i])
             b_global_vtcm[vn_ind * 128 + vi_index] = b_buffer[vn_ind, vi_index]
     for n in T.grid(operations):
@@ -190,7 +190,7 @@ def operator(
                 dtype="int32x32",
             )
     for n, i in T.grid(operations, 32):
-        with T.block("C_global.vtcm"):
+        with T.block("c_buffer_global.vtcm"):
             vn_ind, vi_index = T.axis.remap("SS", [n, i])
             c_buffer[vn_ind, vi_index] = c_global_vtcm[vn_ind * 32 + vi_index]
@@ -328,7 +328,7 @@ def evaluate_result(operations, tag, time, result, expected_output):
     gops = round(operations * 128 * 3 / time.mean / 1e9, 3)
     mean_ms = round(time.mean * 1000, 6)
-    print("\ntest_{}MB_{} took {} ms @ GOPS: {}".format(transfer_mb, tag, mean_ms, gops))
+    print(f"\ntest_{transfer_mb}MB_{tag} took {mean_ms} ms @ GOPS: {gops}")

     tvm.testing.assert_allclose(result, expected_output)
@@ -469,12 +469,12 @@ def test_loading_vtcm_for_vrmpy(
     sch = apply_vrmpy_parallelization(sch)
     sch = apply_unroll_vectorize(
         sch,
-        [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+        [sch.get_block("a_buffer_global.vtcm"), sch.get_block("b_buffer_global.vtcm")],
         unroll_split,
         vector_split,
     )
     sch = apply_unroll_vectorize(
-        sch, [sch.get_block("C_global.vtcm")], unroll_split, c_vector_split_unallocated
+        sch, [sch.get_block("c_buffer_global.vtcm")], unroll_split, c_vector_split_unallocated
     )
     vectorized_runtime, result = setup_and_run(
         hexagon_session, sch, input_a, input_b, input_c, operations
@@ -487,14 +487,14 @@ def test_loading_vtcm_for_vrmpy(
     sch = apply_vrmpy_parallelization(sch)
     sch = apply_parallel_unroll_vectorize(
         sch,
-        [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+        [sch.get_block("a_buffer_global.vtcm"), sch.get_block("b_buffer_global.vtcm")],
         outer_split,
         unroll_split,
         vector_split,
     )
     sch = apply_parallel_unroll_vectorize(
         sch,
-        [sch.get_block("C_global.vtcm")],
+        [sch.get_block("c_buffer_global.vtcm")],
         outer_split,
         unroll_split,
         c_vector_split_unallocated,
@@ -509,12 +509,12 @@ def test_loading_vtcm_for_vrmpy(
     sch = apply_vrmpy_parallelization(sch)
     sch = apply_unroll_vectorize(
         sch,
-        [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+        [sch.get_block("a_buffer_global.vtcm"), sch.get_block("b_buffer_global.vtcm")],
         unroll_split,
         vector_split,
     )
     sch = apply_unroll_vectorize(
-        sch, [sch.get_block("C_global.vtcm")], unroll_split, c_vector_split
+        sch, [sch.get_block("c_buffer_global.vtcm")], unroll_split, c_vector_split
     )
     preallocated_vectorized_runtime, result = setup_and_run_preallocated(
         hexagon_session, sch, input_a, input_b, input_c, operations
@@ -527,13 +527,13 @@ def test_loading_vtcm_for_vrmpy(
     sch = apply_vrmpy_parallelization(sch)
     sch = apply_parallel_unroll_vectorize(
         sch,
-        [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")],
+        [sch.get_block("a_buffer_global.vtcm"), sch.get_block("b_buffer_global.vtcm")],
         outer_split,
         unroll_split,
         vector_split,
     )
     sch = apply_parallel_unroll_vectorize(
-        sch, [sch.get_block("C_global.vtcm")], outer_split, unroll_split, c_vector_split
+        sch, [sch.get_block("c_buffer_global.vtcm")], outer_split, unroll_split, c_vector_split
     )
     prealloc_vector_parallelized, result = setup_and_run_preallocated(
         hexagon_session, sch, input_a, input_b, input_c, operations